path: root/libavcodec/x86
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--  libavcodec/x86/Makefile | 102
-rw-r--r--  libavcodec/x86/aacencdsp.asm | 86
-rw-r--r--  libavcodec/x86/aacencdsp_init.c | 43
-rw-r--r--  libavcodec/x86/aacpsdsp.asm | 209
-rw-r--r--  libavcodec/x86/aacpsdsp_init.c | 55
-rw-r--r--  libavcodec/x86/ac3dsp.asm | 10
-rw-r--r--  libavcodec/x86/ac3dsp_downmix.asm | 8
-rw-r--r--  libavcodec/x86/ac3dsp_init.c | 8
-rw-r--r--  libavcodec/x86/alacdsp.asm | 133
-rw-r--r--  libavcodec/x86/alacdsp_init.c | 44
-rw-r--r--  libavcodec/x86/audiodsp.asm | 23
-rw-r--r--  libavcodec/x86/audiodsp_init.c | 9
-rw-r--r--  libavcodec/x86/blockdsp.asm | 86
-rw-r--r--  libavcodec/x86/blockdsp.c | 118
-rw-r--r--  libavcodec/x86/blockdsp_init.c | 54
-rw-r--r--  libavcodec/x86/bswapdsp.asm | 17
-rw-r--r--  libavcodec/x86/bswapdsp_init.c | 8
-rw-r--r--  libavcodec/x86/cabac.h | 60
-rw-r--r--  libavcodec/x86/cavsdsp.c | 147
-rw-r--r--  libavcodec/x86/constants.c | 71
-rw-r--r--  libavcodec/x86/constants.h | 46
-rw-r--r--  libavcodec/x86/dca.h | 60
-rw-r--r--  libavcodec/x86/dcadsp.asm | 537
-rw-r--r--  libavcodec/x86/dcadsp_init.c | 88
-rw-r--r--  libavcodec/x86/dct32.asm | 10
-rw-r--r--  libavcodec/x86/dct_init.c | 8
-rw-r--r--  libavcodec/x86/dirac_dwt.asm | 307
-rw-r--r--  libavcodec/x86/dirac_dwt_init.c | 229
-rw-r--r--  libavcodec/x86/diracdsp.asm | 347
-rw-r--r--  libavcodec/x86/diracdsp_init.c | 195
-rw-r--r--  libavcodec/x86/dnxhdenc.asm | 8
-rw-r--r--  libavcodec/x86/dnxhdenc_init.c | 8
-rw-r--r--  libavcodec/x86/fdct.c | 12
-rw-r--r--  libavcodec/x86/fdct.h | 8
-rw-r--r--  libavcodec/x86/fdctdsp_init.c | 8
-rw-r--r--  libavcodec/x86/fft.asm | 158
-rw-r--r--  libavcodec/x86/fft.h | 16
-rw-r--r--  libavcodec/x86/fft_init.c | 18
-rw-r--r--  libavcodec/x86/flac_dsp_gpl.asm | 101
-rw-r--r--  libavcodec/x86/flacdsp.asm | 313
-rw-r--r--  libavcodec/x86/flacdsp_init.c | 115
-rw-r--r--  libavcodec/x86/fmtconvert.asm | 55
-rw-r--r--  libavcodec/x86/fmtconvert_init.c | 14
-rw-r--r--  libavcodec/x86/fpel.asm | 134
-rw-r--r--  libavcodec/x86/fpel.h | 18
-rw-r--r--  libavcodec/x86/fpel_mmx.c | 140
-rw-r--r--  libavcodec/x86/g722dsp.asm | 54
-rw-r--r--  libavcodec/x86/g722dsp_init.c | 35
-rw-r--r--  libavcodec/x86/h263_loopfilter.asm | 10
-rw-r--r--  libavcodec/x86/h263dsp_init.c | 8
-rw-r--r--  libavcodec/x86/h264_chromamc.asm | 8
-rw-r--r--  libavcodec/x86/h264_chromamc_10bit.asm | 12
-rw-r--r--  libavcodec/x86/h264_deblock.asm | 404
-rw-r--r--  libavcodec/x86/h264_deblock_10bit.asm | 174
-rw-r--r--  libavcodec/x86/h264_i386.h | 44
-rw-r--r--  libavcodec/x86/h264_idct.asm | 40
-rw-r--r--  libavcodec/x86/h264_idct_10bit.asm | 86
-rw-r--r--  libavcodec/x86/h264_intrapred.asm | 50
-rw-r--r--  libavcodec/x86/h264_intrapred_10bit.asm | 55
-rw-r--r--  libavcodec/x86/h264_intrapred_init.c | 15
-rw-r--r--  libavcodec/x86/h264_qpel.c | 48
-rw-r--r--  libavcodec/x86/h264_qpel_10bit.asm | 16
-rw-r--r--  libavcodec/x86/h264_qpel_8bit.asm | 8
-rw-r--r--  libavcodec/x86/h264_weight.asm | 8
-rw-r--r--  libavcodec/x86/h264_weight_10bit.asm | 11
-rw-r--r--  libavcodec/x86/h264chroma_init.c | 8
-rw-r--r--  libavcodec/x86/h264dsp_init.c | 136
-rw-r--r--  libavcodec/x86/hevc_add_res.asm | 66
-rw-r--r--  libavcodec/x86/hevc_deblock.asm | 456
-rw-r--r--  libavcodec/x86/hevc_idct.asm | 15
-rw-r--r--  libavcodec/x86/hevc_mc.asm | 2273
-rw-r--r--  libavcodec/x86/hevc_sao.asm | 340
-rw-r--r--  libavcodec/x86/hevc_sao_10bit.asm | 370
-rw-r--r--  libavcodec/x86/hevcdsp.h | 259
-rw-r--r--  libavcodec/x86/hevcdsp_init.c | 1318
-rw-r--r--  libavcodec/x86/hpeldsp.asm | 318
-rw-r--r--  libavcodec/x86/hpeldsp.h | 25
-rw-r--r--  libavcodec/x86/hpeldsp_init.c | 86
-rw-r--r--  libavcodec/x86/hpeldsp_mmx.c | 53
-rw-r--r--  libavcodec/x86/hpeldsp_rnd_template.c | 57
-rw-r--r--  libavcodec/x86/hpeldsp_vp3.asm | 8
-rw-r--r--  libavcodec/x86/hpeldsp_vp3_init.c | 22
-rw-r--r--  libavcodec/x86/huffyuvdsp.asm | 255
-rw-r--r--  libavcodec/x86/huffyuvdsp_init.c | 121
-rw-r--r--  libavcodec/x86/huffyuvencdsp.asm | 143
-rw-r--r--  libavcodec/x86/huffyuvencdsp_init.c | 54
-rw-r--r--  libavcodec/x86/idctdsp.asm | 183
-rw-r--r--  libavcodec/x86/idctdsp.h | 15
-rw-r--r--  libavcodec/x86/idctdsp_init.c | 62
-rw-r--r--  libavcodec/x86/idctdsp_mmx.c | 168
-rw-r--r--  libavcodec/x86/imdct36.asm | 42
-rw-r--r--  libavcodec/x86/inline_asm.h | 10
-rw-r--r--  libavcodec/x86/jpeg2000dsp.asm | 144
-rw-r--r--  libavcodec/x86/jpeg2000dsp_init.c | 50
-rw-r--r--  libavcodec/x86/lossless_audiodsp.asm (renamed from libavcodec/x86/apedsp.asm) | 55
-rw-r--r--  libavcodec/x86/lossless_audiodsp_init.c (renamed from libavcodec/x86/apedsp_init.c) | 21
-rw-r--r--  libavcodec/x86/lossless_videodsp.asm | 290
-rw-r--r--  libavcodec/x86/lossless_videodsp_init.c | 118
-rw-r--r--  libavcodec/x86/lossless_videoencdsp.asm | 150
-rw-r--r--  libavcodec/x86/lossless_videoencdsp_init.c (renamed from libavcodec/x86/huffyuvencdsp_mmx.c) | 69
-rw-r--r--  libavcodec/x86/lpc.c | 14
-rw-r--r--  libavcodec/x86/mathops.h | 10
-rw-r--r--  libavcodec/x86/mdct.h | 32
-rw-r--r--  libavcodec/x86/mdct_init.c | 51
-rw-r--r--  libavcodec/x86/me_cmp.asm | 646
-rw-r--r--  libavcodec/x86/me_cmp_init.c | 938
-rw-r--r--  libavcodec/x86/mlpdsp.asm | 196
-rw-r--r--  libavcodec/x86/mlpdsp_init.c (renamed from libavcodec/x86/mlpdsp.c) | 43
-rw-r--r--  libavcodec/x86/mpegaudiodsp.c | 51
-rw-r--r--  libavcodec/x86/mpegvideo.c | 75
-rw-r--r--  libavcodec/x86/mpegvideodsp.c | 22
-rw-r--r--  libavcodec/x86/mpegvideoenc.c | 26
-rw-r--r--  libavcodec/x86/mpegvideoenc_qns_template.c | 12
-rw-r--r--  libavcodec/x86/mpegvideoenc_template.c | 71
-rw-r--r--  libavcodec/x86/mpegvideoencdsp.asm | 151
-rw-r--r--  libavcodec/x86/mpegvideoencdsp_init.c | 49
-rw-r--r--  libavcodec/x86/pixblockdsp.asm | 59
-rw-r--r--  libavcodec/x86/pixblockdsp_init.c | 11
-rw-r--r--  libavcodec/x86/pngdsp.asm | 22
-rw-r--r--  libavcodec/x86/pngdsp_init.c | 8
-rw-r--r--  libavcodec/x86/proresdsp.asm | 421
-rw-r--r--  libavcodec/x86/proresdsp_init.c | 17
-rw-r--r--  libavcodec/x86/qpel.asm | 8
-rw-r--r--  libavcodec/x86/qpeldsp.asm | 13
-rw-r--r--  libavcodec/x86/qpeldsp_init.c | 18
-rw-r--r--  libavcodec/x86/rnd_template.c | 52
-rw-r--r--  libavcodec/x86/rv34dsp.asm | 27
-rw-r--r--  libavcodec/x86/rv34dsp_init.c | 13
-rw-r--r--  libavcodec/x86/rv40dsp.asm | 8
-rw-r--r--  libavcodec/x86/rv40dsp_init.c | 74
-rw-r--r--  libavcodec/x86/sbrdsp.asm | 317
-rw-r--r--  libavcodec/x86/sbrdsp_init.c | 38
-rw-r--r--  libavcodec/x86/simple_idct.c | 12
-rw-r--r--  libavcodec/x86/simple_idct.h | 20
-rw-r--r--  libavcodec/x86/simple_idct10.asm | 100
-rw-r--r--  libavcodec/x86/simple_idct10_template.asm | 315
-rw-r--r--  libavcodec/x86/snowdsp.c | 908
-rw-r--r--  libavcodec/x86/svq1enc.asm | 61
-rw-r--r--  libavcodec/x86/svq1enc.c | 73
-rw-r--r--  libavcodec/x86/svq1enc_init.c | 42
-rw-r--r--  libavcodec/x86/synth_filter.asm | 246
-rw-r--r--  libavcodec/x86/synth_filter_init.c | 74
-rw-r--r--  libavcodec/x86/takdsp.asm | 116
-rw-r--r--  libavcodec/x86/takdsp_init.c | 45
-rw-r--r--  libavcodec/x86/ttadsp.asm | 119
-rw-r--r--  libavcodec/x86/ttadsp_init.c | 42
-rw-r--r--  libavcodec/x86/ttaencdsp.asm | 119
-rw-r--r--  libavcodec/x86/ttaencdsp_init.c | 42
-rw-r--r--  libavcodec/x86/v210-init.c | 48
-rw-r--r--  libavcodec/x86/v210.asm | 90
-rw-r--r--  libavcodec/x86/v210enc.asm | 38
-rw-r--r--  libavcodec/x86/v210enc_init.c | 8
-rw-r--r--  libavcodec/x86/vc1dsp.h | 8
-rw-r--r--  libavcodec/x86/vc1dsp_init.c | 50
-rw-r--r--  libavcodec/x86/vc1dsp_loopfilter.asm (renamed from libavcodec/x86/vc1dsp.asm) | 10
-rw-r--r--  libavcodec/x86/vc1dsp_mc.asm | 292
-rw-r--r--  libavcodec/x86/vc1dsp_mmx.c | 436
-rw-r--r--  libavcodec/x86/videodsp.asm | 100
-rw-r--r--  libavcodec/x86/videodsp_init.c | 95
-rw-r--r--  libavcodec/x86/vorbisdsp.asm | 8
-rw-r--r--  libavcodec/x86/vorbisdsp_init.c | 8
-rw-r--r--  libavcodec/x86/vp3dsp.asm | 52
-rw-r--r--  libavcodec/x86/vp3dsp_init.c | 20
-rw-r--r--  libavcodec/x86/vp56_arith.h | 23
-rw-r--r--  libavcodec/x86/vp6dsp.asm | 8
-rw-r--r--  libavcodec/x86/vp6dsp_init.c | 8
-rw-r--r--  libavcodec/x86/vp8dsp.asm | 27
-rw-r--r--  libavcodec/x86/vp8dsp_init.c | 25
-rw-r--r--  libavcodec/x86/vp8dsp_loopfilter.asm | 8
-rw-r--r--  libavcodec/x86/vp9dsp_init.c | 527
-rw-r--r--  libavcodec/x86/vp9dsp_init.h | 189
-rw-r--r--  libavcodec/x86/vp9dsp_init_10bpp.c | 25
-rw-r--r--  libavcodec/x86/vp9dsp_init_12bpp.c | 25
-rw-r--r--  libavcodec/x86/vp9dsp_init_16bpp.c | 141
-rw-r--r--  libavcodec/x86/vp9dsp_init_16bpp_template.c | 240
-rw-r--r--  libavcodec/x86/vp9intrapred.asm | 2044
-rw-r--r--  libavcodec/x86/vp9intrapred_16bpp.asm | 2174
-rw-r--r--  libavcodec/x86/vp9itxfm.asm | 3197
-rw-r--r--  libavcodec/x86/vp9itxfm_16bpp.asm | 2044
-rw-r--r--  libavcodec/x86/vp9itxfm_template.asm | 142
-rw-r--r--  libavcodec/x86/vp9lpf.asm | 251
-rw-r--r--  libavcodec/x86/vp9lpf_16bpp.asm | 823
-rw-r--r--  libavcodec/x86/vp9mc.asm | 105
-rw-r--r--  libavcodec/x86/vp9mc_16bpp.asm | 431
-rw-r--r--  libavcodec/x86/w64xmmtest.c | 8
-rw-r--r--  libavcodec/x86/xvididct.asm | 983
-rw-r--r--  libavcodec/x86/xvididct.h | 12
-rw-r--r--  libavcodec/x86/xvididct_init.c | 56
-rw-r--r--  libavcodec/x86/xvididct_mmx.c | 548
-rw-r--r--  libavcodec/x86/xvididct_sse2.c | 405
190 files changed, 27963 insertions, 6686 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 094c1fa517..0295a9f8f7 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -3,11 +3,14 @@ OBJS += x86/constants.o \
# subsystems
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
+OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
+OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
+ x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
+OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
@@ -15,11 +18,13 @@ OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
+OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o
+OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
+OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_mmx.o
+OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
-OBJS-$(CONFIG_MDCT) += x86/mdct_init.o
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
@@ -36,44 +41,56 @@ OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
# decoders/encoders
-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp_init.o
+OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
+ x86/sbrdsp_init.o
+OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
+OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
+OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
+OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
+OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
+OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
+OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
+OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
+OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
+OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
+OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o
+OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o
+OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
+ x86/vp9dsp_init_10bpp.o \
+ x86/vp9dsp_init_12bpp.o \
+ x86/vp9dsp_init_16bpp.o
+OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
# GCC inline assembly optimizations
# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
-MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \
- x86/hpeldsp_mmx.o
-MMX-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_mmx.o \
- x86/simple_idct.o
-MMX-OBJS-$(CONFIG_QPELDSP) += x86/fpel_mmx.o
+MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o
MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o
# decoders/encoders
-MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_mmx.o \
- x86/xvididct_sse2.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
# subsystems
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
x86/ac3dsp_downmix.o
YASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
+YASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
YASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
@@ -96,6 +113,11 @@ YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
x86/hpeldsp.o
YASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
+YASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o
+YASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
+YASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
+YASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
+YASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
YASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
YASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
@@ -104,27 +126,59 @@ YASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
x86/fpel.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
-YASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp.o
+YASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
+ x86/vc1dsp_mc.o
+YASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
x86/vp8dsp_loopfilter.o
# decoders/encoders
-YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
-YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o
-YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
+YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
+ x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o
+YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
+YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
+YASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
+YASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
+YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
+YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
+ x86/dirac_dwt.o
YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
+YASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
+ifdef CONFIG_GPL
+YASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
+endif
YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
x86/hevc_deblock.o \
x86/hevc_idct.o \
- x86/hevc_mc.o
+ x86/hevc_mc.o \
+ x86/hevc_sao.o \
+ x86/hevc_sao_10bit.o
+YASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
+YASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
+YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
+YASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
+YASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
+YASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
+YASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o
YASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
+YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
-YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9mc.o \
- x86/vp9lpf.o
+YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
+ x86/vp9intrapred_16bpp.o \
+ x86/vp9itxfm.o \
+ x86/vp9itxfm_16bpp.o \
+ x86/vp9lpf.o \
+ x86/vp9lpf_16bpp.o \
+ x86/vp9mc.o \
+ x86/vp9mc_16bpp.o
+YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm
new file mode 100644
index 0000000000..97af571ec8
--- /dev/null
+++ b/libavcodec/x86/aacencdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD optimized AAC encoder DSP functions
+;*
+;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+float_abs_mask: times 4 dd 0x7fffffff
+
+SECTION .text
+
+;*******************************************************************
+;void ff_abs_pow34(float *out, const float *in, const int size);
+;*******************************************************************
+INIT_XMM sse
+cglobal abs_pow34, 3, 3, 3, out, in, size
+ mova m2, [float_abs_mask]
+ shl sizeq, 2
+ add inq, sizeq
+ add outq, sizeq
+ neg sizeq
+.loop:
+ andps m0, m2, [inq+sizeq]
+ sqrtps m1, m0
+ mulps m0, m1
+ sqrtps m0, m0
+ mova [outq+sizeq], m0
+ add sizeq, mmsize
+ jl .loop
+ RET
+
+;*******************************************************************
+;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
+; int size, int is_signed, int maxval, const float Q34,
+; const float rounding)
+;*******************************************************************
+INIT_XMM sse2
+cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
+%if UNIX64 == 0
+ movss m0, Q34m
+ movss m1, roundingm
+ cvtsi2ss m3, dword maxvalm
+%else
+ cvtsi2ss m3, maxvald
+%endif
+ shufps m0, m0, 0
+ shufps m1, m1, 0
+ shufps m3, m3, 0
+ shl is_signedd, 31
+ movd m4, is_signedd
+ shufps m4, m4, 0
+ shl sized, 2
+ add inq, sizeq
+ add outq, sizeq
+ add scaledq, sizeq
+ neg sizeq
+.loop:
+ mulps m2, m0, [scaledq+sizeq]
+ addps m2, m1
+ minps m2, m3
+ andps m5, m4, [inq+sizeq]
+ orps m2, m5
+ cvttps2dq m2, m2
+ mova [outq+sizeq], m2
+ add sizeq, mmsize
+ jl .loop
+ RET
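
The SSE loop above avoids a pow() call by computing |x|^(3/4) as sqrt(|x| * sqrt(|x|)). For reference, a scalar C sketch of the same math (illustrative only, not part of the patch; the function name is made up):

#include <math.h>

/* Scalar equivalent of the ff_abs_pow34 SSE loop: |x| * sqrt(|x|) gives
 * |x|^1.5, and a second sqrt yields |x|^0.75. */
static void abs_pow34_scalar(float *out, const float *in, int size)
{
    for (int i = 0; i < size; i++) {
        float a = fabsf(in[i]);
        out[i] = sqrtf(a * sqrtf(a));
    }
}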
diff --git a/libavcodec/x86/aacencdsp_init.c b/libavcodec/x86/aacencdsp_init.c
new file mode 100644
index 0000000000..d761c3c5e6
--- /dev/null
+++ b/libavcodec/x86/aacencdsp_init.c
@@ -0,0 +1,43 @@
+/*
+ * AAC encoder assembly optimizations
+ * Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/float_dsp.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/aacenc.h"
+
+void ff_abs_pow34_sse(float *out, const float *in, const int size);
+
+void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
+ int size, int is_signed, int maxval, const float Q34,
+ const float rounding);
+
+av_cold void ff_aac_dsp_init_x86(AACEncContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE(cpu_flags))
+ s->abs_pow34 = ff_abs_pow34_sse;
+
+ if (EXTERNAL_SSE2(cpu_flags))
+ s->quant_bands = ff_aac_quantize_bands_sse2;
+}
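
The EXTERNAL_SSE()/EXTERNAL_SSE2() checks come from libavutil/x86/cpu.h and pass only when the CPU reports the feature and the build actually contains the external (yasm/nasm) assembly. A simplified sketch of the idea, not the real macro definitions:

/* Illustrative only; the real EXTERNAL_* macros test the HAVE_*_EXTERNAL
 * configure results rather than a bare HAVE_YASM check. */
int cpu_flags = av_get_cpu_flags();
if (HAVE_YASM && (cpu_flags & AV_CPU_FLAG_SSE))
    s->abs_pow34   = ff_abs_pow34_sse;
if (HAVE_YASM && (cpu_flags & AV_CPU_FLAG_SSE2))
    s->quant_bands = ff_aac_quantize_bands_sse2;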
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
new file mode 100644
index 0000000000..e92cbbce08
--- /dev/null
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -0,0 +1,209 @@
+;******************************************************************************
+;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
+
+SECTION .text
+
+;*************************************************************************
+;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
+;*************************************************************************
+%macro PS_ADD_SQUARES 1
+cglobal ps_add_squares, 3, 3, %1, dst, src, n
+ shl nd, 3
+ add srcq, nq
+ neg nq
+
+align 16
+.loop:
+ movaps m0, [srcq+nq]
+ movaps m1, [srcq+nq+mmsize]
+ mulps m0, m0
+ mulps m1, m1
+ HADDPS m0, m1, m2
+ addps m0, [dstq]
+ movaps [dstq], m0
+ add dstq, mmsize
+ add nq, mmsize*2
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_ADD_SQUARES 2
+INIT_XMM sse3
+PS_ADD_SQUARES 3
+
+;*******************************************************************
+;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
+; float *src1, int n);
+;*******************************************************************
+INIT_XMM sse
+cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n
+ xor r4q, r4q
+
+.loop:
+ movu m0, [src1q+r4q]
+ movu m1, [src1q+r4q+mmsize]
+ mova m2, [src2q]
+ mova m3, m2
+ unpcklps m2, m2
+ unpckhps m3, m3
+ mulps m0, m2
+ mulps m1, m3
+ mova [dstq+r4q], m0
+ mova [dstq+r4q+mmsize], m1
+ add src2q, mmsize
+ add r4q, mmsize*2
+ sub nd, mmsize/4
+ jg .loop
+ REP_RET
+
+;***********************************************************************
+;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+; float h[2][4], float h_step[2][4],
+; int len);
+;***********************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
+ movaps m0, [hq]
+ movaps m1, [h_stepq]
+ cmp nd, 0
+ jle .ret
+ shl nd, 3
+ add lq, nq
+ add rq, nq
+ neg nq
+
+align 16
+.loop:
+ addps m0, m1
+ movddup m2, [lq+nq]
+ movddup m3, [rq+nq]
+ movaps m4, m0
+ movaps m5, m0
+ unpcklps m4, m4
+ unpckhps m5, m5
+ mulps m2, m4
+ mulps m3, m5
+ addps m2, m3
+ movsd [lq+nq], m2
+ movhps [rq+nq], m2
+ add nq, 8
+ jl .loop
+.ret:
+ REP_RET
+
+;*******************************************************************
+;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
+; const float (*filter)[8][2],
+; int stride, int n);
+;*******************************************************************
+%macro PS_HYBRID_ANALYSIS_LOOP 3
+ movu %1, [inq+mmsize*%3]
+ movu m1, [inq+mmsize*(5-%3)+8]
+%if cpuflag(sse3)
+ pshufd %2, %1, q2301
+ pshufd m4, m1, q0123
+ pshufd m1, m1, q1032
+ pshufd m2, [filterq+nq+mmsize*%3], q2301
+ addsubps %2, m4
+ addsubps %1, m1
+%else
+ mova m2, [filterq+nq+mmsize*%3]
+ mova %2, %1
+ mova m4, m1
+ shufps %2, %2, q2301
+ shufps m4, m4, q0123
+ shufps m1, m1, q1032
+ shufps m2, m2, q2301
+ xorps m4, m7
+ xorps m1, m7
+ subps %2, m4
+ subps %1, m1
+%endif
+ mulps %2, m2
+ mulps %1, m2
+%if %3
+ addps m3, %2
+ addps m0, %1
+%endif
+%endmacro
+
+%macro PS_HYBRID_ANALYSIS 0
+cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
+%if cpuflag(sse3)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+ shl strided, 3
+ shl nd, 6
+ add filterq, nq
+ neg nq
+ mova m7, [ps_p1m1p1m1]
+
+align 16
+.loop:
+ PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
+ PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
+ PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
+
+%if cpuflag(sse3)
+ pshufd m3, m3, q2301
+ xorps m0, m7
+ hsubps m3, m0
+ pshufd m1, m3, q0020
+ pshufd m3, m3, q0031
+ addps m1, m3
+ movsd m2, [inq+6*8]
+%else
+ mova m1, m3
+ mova m2, m0
+ shufps m1, m1, q2301
+ shufps m2, m2, q2301
+ subps m1, m3
+ addps m2, m0
+ unpcklps m3, m1, m2
+ unpckhps m1, m2
+ addps m1, m3
+ movu m2, [inq+6*8] ; faster than movlps and no risk of overread
+%endif
+ movss m3, [filterq+nq+8*6]
+ SPLATD m3
+ mulps m2, m3
+ addps m1, m2
+ MOVH [outq], m1
+ add outq, strideq
+ add nq, 64
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_HYBRID_ANALYSIS
+INIT_XMM sse3
+PS_HYBRID_ANALYSIS
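
ps_add_squares accumulates the squared magnitude of each complex sample into dst; the HADDPS-based loop above corresponds to this scalar sketch (illustrative only, not part of the patch; the name is made up):

/* dst[i] += |src[i]|^2 for each complex (re, im) pair */
static void ps_add_squares_scalar(float *dst, const float (*src)[2], int n)
{
    for (int i = 0; i < n; i++)
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}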
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
new file mode 100644
index 0000000000..f6d6c039c3
--- /dev/null
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -0,0 +1,55 @@
+/*
+ * SIMD optimized MPEG-4 Parametric Stereo decoding functions
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_sse (float *dst, const float (*src)[2], int n);
+void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
+void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
+ float *src1, int n);
+void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
+ const float (*filter)[8][2],
+ int stride, int n);
+void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
+ const float (*filter)[8][2],
+ int stride, int n);
+void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4],
+ int len);
+
+av_cold void ff_psdsp_init_x86(PSDSPContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ s->add_squares = ff_ps_add_squares_sse;
+ s->mul_pair_single = ff_ps_mul_pair_single_sse;
+ s->hybrid_analysis = ff_ps_hybrid_analysis_sse;
+ }
+ if (EXTERNAL_SSE3(cpu_flags)) {
+ s->add_squares = ff_ps_add_squares_sse3;
+ s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
+ s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
+ }
+}
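
Of the prototypes above, ps_mul_pair_single is the simplest to read back into C: each complex pair in src0 is scaled by the matching real value in src1. A scalar sketch (illustrative only, not part of the patch):

static void ps_mul_pair_single_scalar(float (*dst)[2], float (*src0)[2],
                                      float *src1, int n)
{
    for (int i = 0; i < n; i++) {
        dst[i][0] = src0[i][0] * src1[i];
        dst[i][1] = src0[i][1] * src1[i];
    }
}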
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 817d5a319c..675ade3101 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -2,20 +2,20 @@
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -32,7 +32,7 @@ pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
; used in ff_ac3_extract_exponents()
-pd_1: times 4 dd 1
+cextern pd_1
pd_151: times 4 dd 151
; used in ff_apply_window_int16()
diff --git a/libavcodec/x86/ac3dsp_downmix.asm b/libavcodec/x86/ac3dsp_downmix.asm
index b085035ce3..057cc6061c 100644
--- a/libavcodec/x86/ac3dsp_downmix.asm
+++ b/libavcodec/x86/ac3dsp_downmix.asm
@@ -2,20 +2,20 @@
;* x86-optimized AC-3 downmixing
;* Copyright (c) 2012 Justin Ruggles
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 6d049b37cf..2e7e2fb6da 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -2,20 +2,20 @@
* x86-optimized AC-3 DSP functions
* Copyright (c) 2011 Justin Ruggles
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/alacdsp.asm b/libavcodec/x86/alacdsp.asm
new file mode 100644
index 0000000000..bb2069f785
--- /dev/null
+++ b/libavcodec/x86/alacdsp.asm
@@ -0,0 +1,133 @@
+;******************************************************************************
+;* ALAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
+%else
+cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
+%define buf1q r2q
+%endif
+ movd m6, shiftm
+ movd m7, weightm
+ SPLATD m7
+ shl lend, 2
+ mov buf1q, [buf0q + gprsize]
+ mov buf0q, [buf0q]
+ add buf1q, lenq
+ add buf0q, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [buf0q + lenq]
+ mova m1, [buf0q + lenq + mmsize]
+ mova m2, [buf1q + lenq]
+ mova m3, [buf1q + lenq + mmsize]
+ pmulld m4, m2, m7
+ pmulld m5, m3, m7
+ psrad m4, m6
+ psrad m5, m6
+ psubd m0, m4
+ psubd m1, m5
+ paddd m2, m0
+ paddd m3, m1
+ mova [buf1q + lenq], m0
+ mova [buf1q + lenq + mmsize], m1
+ mova [buf0q + lenq], m2
+ mova [buf0q + lenq + mmsize], m3
+
+ add lenq, mmsize*2
+ jl .loop
+ RET
+
+INIT_XMM sse2
+cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
+ movifnidn lend, lenm
+ movd m4, r2m ; exbits
+ shl lend, 2
+ mov buf1q, [buf0q + gprsize]
+ mov buf0q, [buf0q]
+ mov exbuf1q, [exbuf0q + gprsize]
+ mov exbuf0q, [exbuf0q]
+ add buf1q, lenq
+ add buf0q, lenq
+ add exbuf1q, lenq
+ add exbuf0q, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [buf0q + lenq]
+ mova m1, [buf0q + lenq + mmsize]
+ pslld m0, m4
+ pslld m1, m4
+ mova m2, [buf1q + lenq]
+ mova m3, [buf1q + lenq + mmsize]
+ pslld m2, m4
+ pslld m3, m4
+ por m0, [exbuf0q + lenq]
+ por m1, [exbuf0q + lenq + mmsize]
+ por m2, [exbuf1q + lenq]
+ por m3, [exbuf1q + lenq + mmsize]
+ mova [buf0q + lenq ], m0
+ mova [buf0q + lenq + mmsize], m1
+ mova [buf1q + lenq ], m2
+ mova [buf1q + lenq + mmsize], m3
+
+ add lenq, mmsize*2
+ jl .loop
+ REP_RET
+
+%if ARCH_X86_64
+cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
+%else
+cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
+%define exbitsm r2m
+%endif
+ movifnidn lend, r4m
+ movd m2, exbitsm
+ shl lend, 2
+ mov bufq, [bufq]
+ mov exbufq, [exbufq]
+ add bufq, lenq
+ add exbufq, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [bufq + lenq]
+ mova m1, [bufq + lenq + mmsize]
+ pslld m0, m2
+ pslld m1, m2
+ por m0, [exbufq + lenq]
+ por m1, [exbufq + lenq + mmsize]
+ mova [bufq + lenq], m0
+ mova [bufq + lenq + mmsize], m1
+
+ add lenq, mmsize*2
+ jl .loop
+ REP_RET
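
The SSE4 routine above vectorizes ALAC's stereo decorrelation; a scalar sketch of the same arithmetic (illustrative only, not part of the patch; the name is made up):

#include <stdint.h>

static void alac_decorrelate_stereo_scalar(int32_t *buffer[2], int nb_samples,
                                           int decorr_shift, int decorr_left_weight)
{
    for (int i = 0; i < nb_samples; i++) {
        int32_t a = buffer[0][i];
        int32_t b = buffer[1][i];

        a -= (b * decorr_left_weight) >> decorr_shift;
        b += a;
        buffer[0][i] = b;
        buffer[1][i] = a;
    }
}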
diff --git a/libavcodec/x86/alacdsp_init.c b/libavcodec/x86/alacdsp_init.c
new file mode 100644
index 0000000000..de5dae6c77
--- /dev/null
+++ b/libavcodec/x86/alacdsp_init.c
@@ -0,0 +1,44 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/alacdsp.h"
+#include "config.h"
+
+void ff_alac_decorrelate_stereo_sse4(int32_t *buffer[2], int nb_samples,
+ int decorr_shift, int decorr_left_weight);
+void ff_alac_append_extra_bits_stereo_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+ int extra_bits, int channels, int nb_samples);
+void ff_alac_append_extra_bits_mono_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+ int extra_bits, int channels, int nb_samples);
+
+av_cold void ff_alacdsp_init_x86(ALACDSPContext *c)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->append_extra_bits[0] = ff_alac_append_extra_bits_mono_sse2;
+ c->append_extra_bits[1] = ff_alac_append_extra_bits_stereo_sse2;
+ }
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->decorrelate_stereo = ff_alac_decorrelate_stereo_sse4;
+ }
+#endif /* HAVE_YASM */
+}
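
The append_extra_bits routines shift each decoded sample up and OR in the stored low-order extra bits. Shown here for one channel as a scalar sketch (illustrative only, not part of the patch); the stereo variant does the same for both channels:

#include <stdint.h>

static void append_extra_bits_scalar(int32_t *buf, const int32_t *exbuf,
                                     int extra_bits, int nb_samples)
{
    for (int i = 0; i < nb_samples; i++)
        buf[i] = (buf[i] << extra_bits) | exbuf[i];
}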
diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm
index e038c18bd8..3973808ca5 100644
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@@ -2,20 +2,20 @@
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -40,15 +40,11 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
paddd m2, m1
add orderq, mmsize*2
jl .loop
-%if mmsize == 16
- movhlps m0, m2
- paddd m2, m0
- pshuflw m0, m2, 0x4e
-%else
- pshufw m0, m2, 0x4e
-%endif
- paddd m2, m0
+ HADDD m2, m0
movd eax, m2
+%if mmsize == 8
+ emms
+%endif
RET
%endmacro
@@ -144,7 +140,8 @@ cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
VBROADCASTSS m0, minm
VBROADCASTSS m1, maxm
%elif WIN64
- VBROADCASTSS m0, m3
+ SWAP 0, 3
+ VBROADCASTSS m0, m0
VBROADCASTSS m1, maxm
%else ; 64bit sysv
VBROADCASTSS m0, m0
diff --git a/libavcodec/x86/audiodsp_init.c b/libavcodec/x86/audiodsp_init.c
index 093f3f0672..98e296c264 100644
--- a/libavcodec/x86/audiodsp_init.c
+++ b/libavcodec/x86/audiodsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -37,7 +37,6 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
-
void ff_vector_clipf_sse(float *dst, const float *src,
int len, float min, float max);
diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm
new file mode 100644
index 0000000000..7cbfa3a843
--- /dev/null
+++ b/libavcodec/x86/blockdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD-optimized clear block functions
+;* Copyright (c) 2002 Michael Niedermayer
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2009 Fiona Glaser
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;----------------------------------------
+; void ff_clear_block(int16_t *blocks);
+;----------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline store loops
+%macro CLEAR_BLOCK 2
+cglobal clear_block, 1, 1, %1, blocks
+ ZERO m0, m0
+%assign %%i 0
+%rep %2
+ mova [blocksq+mmsize*(0+%%i)], m0
+ mova [blocksq+mmsize*(1+%%i)], m0
+ mova [blocksq+mmsize*(2+%%i)], m0
+ mova [blocksq+mmsize*(3+%%i)], m0
+ mova [blocksq+mmsize*(4+%%i)], m0
+ mova [blocksq+mmsize*(5+%%i)], m0
+ mova [blocksq+mmsize*(6+%%i)], m0
+ mova [blocksq+mmsize*(7+%%i)], m0
+%assign %%i %%i+8
+%endrep
+ RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCK 0, 2
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCK 1, 1
+
+;-----------------------------------------
+; void ff_clear_blocks(int16_t *blocks);
+;-----------------------------------------
+; %1 = number of xmm registers used
+%macro CLEAR_BLOCKS 1
+cglobal clear_blocks, 1, 2, %1, blocks, len
+ add blocksq, 768
+ mov lenq, -768
+ ZERO m0, m0
+.loop:
+ mova [blocksq+lenq+mmsize*0], m0
+ mova [blocksq+lenq+mmsize*1], m0
+ mova [blocksq+lenq+mmsize*2], m0
+ mova [blocksq+lenq+mmsize*3], m0
+ mova [blocksq+lenq+mmsize*4], m0
+ mova [blocksq+lenq+mmsize*5], m0
+ mova [blocksq+lenq+mmsize*6], m0
+ mova [blocksq+lenq+mmsize*7], m0
+ add lenq, mmsize*8
+ js .loop
+ RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS 0
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS 1
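
Functionally, the two macros above just zero one 64-coefficient int16_t block (128 bytes) or six consecutive blocks (768 bytes, the constant seen in CLEAR_BLOCKS). A plain C sketch (illustrative only, not part of the patch):

#include <stdint.h>
#include <string.h>

static void clear_block_ref(int16_t *block)
{
    memset(block, 0, 64 * sizeof(int16_t));      /* 128 bytes */
}

static void clear_blocks_ref(int16_t *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(int16_t)); /* 768 bytes */
}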
diff --git a/libavcodec/x86/blockdsp.c b/libavcodec/x86/blockdsp.c
deleted file mode 100644
index 9bb5185b89..0000000000
--- a/libavcodec/x86/blockdsp.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/blockdsp.h"
-#include "libavcodec/version.h"
-
-#if HAVE_INLINE_ASM
-
-#define CLEAR_BLOCKS(name, n) \
-static void name(int16_t *blocks) \
-{ \
- __asm__ volatile ( \
- "pxor %%mm7, %%mm7 \n\t" \
- "mov %1, %%"FF_REG_a" \n\t" \
- "1: \n\t" \
- "movq %%mm7, (%0, %%"FF_REG_a") \n\t" \
- "movq %%mm7, 8(%0, %%"FF_REG_a") \n\t" \
- "movq %%mm7, 16(%0, %%"FF_REG_a") \n\t" \
- "movq %%mm7, 24(%0, %%"FF_REG_a") \n\t" \
- "add $32, %%"FF_REG_a" \n\t" \
- "js 1b \n\t" \
- :: "r"(((uint8_t *) blocks) + 128 * n), \
- "i"(-128 * n) \
- : "%"FF_REG_a); \
-}
-CLEAR_BLOCKS(clear_blocks_mmx, 6)
-CLEAR_BLOCKS(clear_block_mmx, 1)
-
-static void clear_block_sse(int16_t *block)
-{
- __asm__ volatile (
- "xorps %%xmm0, %%xmm0 \n"
- "movaps %%xmm0, (%0) \n"
- "movaps %%xmm0, 16(%0) \n"
- "movaps %%xmm0, 32(%0) \n"
- "movaps %%xmm0, 48(%0) \n"
- "movaps %%xmm0, 64(%0) \n"
- "movaps %%xmm0, 80(%0) \n"
- "movaps %%xmm0, 96(%0) \n"
- "movaps %%xmm0, 112(%0) \n"
- :: "r" (block)
- : "memory");
-}
-
-static void clear_blocks_sse(int16_t *blocks)
-{
- __asm__ volatile (
- "xorps %%xmm0, %%xmm0 \n"
- "mov %1, %%"FF_REG_a" \n"
- "1: \n"
- "movaps %%xmm0, (%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 16(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 32(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 48(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 64(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 80(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 96(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 112(%0, %%"FF_REG_a") \n"
- "add $128, %%"FF_REG_a" \n"
- "js 1b \n"
- :: "r"(((uint8_t *) blocks) + 128 * 6), "i"(-128 * 6)
- : "%"FF_REG_a);
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-#if FF_API_XVMC
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
- AVCodecContext *avctx)
-#else
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c)
-#endif /* FF_API_XVMC */
-{
-#if HAVE_INLINE_ASM
- int cpu_flags = av_get_cpu_flags();
-
- if (INLINE_MMX(cpu_flags)) {
- c->clear_block = clear_block_mmx;
- c->clear_blocks = clear_blocks_mmx;
- }
-
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
- /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
- if (CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)
- return;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
- if (INLINE_SSE(cpu_flags)) {
- c->clear_block = clear_block_sse;
- c->clear_blocks = clear_blocks_sse;
- }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c
new file mode 100644
index 0000000000..21599934ff
--- /dev/null
+++ b/libavcodec/x86/blockdsp_init.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/blockdsp.h"
+#include "libavcodec/version.h"
+
+void ff_clear_block_mmx(int16_t *block);
+void ff_clear_block_sse(int16_t *block);
+void ff_clear_blocks_mmx(int16_t *blocks);
+void ff_clear_blocks_sse(int16_t *blocks);
+
+av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
+ AVCodecContext *avctx)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ c->clear_block = ff_clear_block_mmx;
+ c->clear_blocks = ff_clear_blocks_mmx;
+ }
+
+ /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
+ if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
+ return;
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ c->clear_block = ff_clear_block_sse;
+ c->clear_blocks = ff_clear_blocks_sse;
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 4810867921..56d8083622 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -1,21 +1,23 @@
;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -24,6 +26,8 @@
SECTION_RODATA
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+cextern pb_80
+
SECTION .text
; %1 = aligned/unaligned
@@ -84,11 +88,14 @@ SECTION .text
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
+ mov r3, r1
mova m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
+ mov r3, r1
%endif
- test r1, 15
+ or r3, r0
+ test r3, 15
jz .start_align
BSWAP_LOOPS u
jmp .left
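
The added mov/or/test sequence widens the alignment check so the fast mova path is only taken when both buffers are 16-byte aligned; in C terms (illustrative only, not part of the patch; the helper name is made up):

#include <stdint.h>

/* "or r3, r0; test r3, 15" in the asm above: one test covers both pointers. */
static int both_16byte_aligned(const void *a, const void *b)
{
    return (((uintptr_t)a | (uintptr_t)b) & 15) == 0;
}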
diff --git a/libavcodec/x86/bswapdsp_init.c b/libavcodec/x86/bswapdsp_init.c
index ba40f2dbe1..c042e56371 100644
--- a/libavcodec/x86/bswapdsp_init.c
+++ b/libavcodec/x86/bswapdsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 205511ef61..cfd3b759c9 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,8 +27,28 @@
#include "libavutil/x86/asm.h"
#include "config.h"
+#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+ || ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
+ || (defined(__INTEL_COMPILER) && defined(_MSC_VER))
+# define BROKEN_COMPILER 1
+#else
+# define BROKEN_COMPILER 0
+#endif
+
#if HAVE_INLINE_ASM
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
+#if UNCHECKED_BITSTREAM_READER
+#define END_CHECK(end) ""
+#else
+#define END_CHECK(end) \
+ "cmp "end" , %%"FF_REG_c" \n\t"\
+ "jge 1f \n\t"
+#endif
+
#ifdef BROKEN_RELOCATIONS
#define TABLES_ARG , "r"(tables)
@@ -73,11 +93,10 @@
"test "lowword" , "lowword" \n\t"\
"jnz 2f \n\t"\
"mov "byte" , %%"FF_REG_c" \n\t"\
- "cmp "end" , %%"FF_REG_c" \n\t"\
- "jge 1f \n\t"\
+ END_CHECK(end)\
"add"FF_OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
- "movzwl (%%"FF_REG_c"), "tmp" \n\t"\
+ "movzwl (%%"FF_REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
@@ -92,7 +111,8 @@
"2: \n\t"
#else /* BROKEN_RELOCATIONS */
-#define TABLES_ARG
+#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
+#define RIP_ARG
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
@@ -134,8 +154,7 @@
"test "lowword" , "lowword" \n\t"\
" jnz 2f \n\t"\
"mov "byte" , %%"FF_REG_c" \n\t"\
- "cmp "end" , %%"FF_REG_c" \n\t"\
- "jge 1f \n\t"\
+ END_CHECK(end)\
"add"FF_OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\
@@ -154,8 +173,7 @@
#endif /* BROKEN_RELOCATIONS */
-
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
#define get_cabac_inline get_cabac_inline_x86
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
uint8_t *const state)
@@ -167,6 +185,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
+ : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
@@ -178,17 +197,19 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%8")
- : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+ : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
+ ,"1"(c->low), "2"(c->range)
: "%"FF_REG_c, "memory"
);
return bit & 1;
}
-#endif /* HAVE_7REGS */
+#endif /* HAVE_7REGS && !BROKEN_COMPILER */
+#if !BROKEN_COMPILER
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
@@ -199,7 +220,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
- "cltd \n\t"
+ "cdq \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"xor %%edx, %%ecx \n\t"
@@ -211,10 +232,16 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
+#if UNCHECKED_BITSTREAM_READER
+ "add $2, %1 \n\t"
+ "addl %%edx, %%eax \n\t"
+ "mov %1, %c4(%2) \n\t"
+#else
"addl %%edx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"FF_OPSIZE" $2, %c4(%2) \n\t"
+#endif
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
@@ -240,7 +267,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
- "cltd \n\t"
+ "cdq \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"inc %%edx \n\t"
@@ -268,6 +295,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
);
return res;
}
+#endif /* !BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */
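
For reference, the END_CHECK() macro introduced above only changes how the bytestream pointer is advanced during renormalization. A minimal C sketch of the two behaviours, assuming nothing beyond the bytestream/bytestream_end fields referenced by the offsetof() constraints (the struct and function names here are illustrative, not FFmpeg API):

    #include <stdint.h>

    typedef struct {
        const uint8_t *bytestream;
        const uint8_t *bytestream_end;
    } MiniCABAC;                          /* stand-in for the relevant CABACContext fields */

    static void advance_bytestream(MiniCABAC *c, int unchecked)
    {
        if (unchecked) {
            /* UNCHECKED_BITSTREAM_READER: END_CHECK expands to nothing,
             * the pointer is always bumped by two bytes. */
            c->bytestream += 2;
        } else if (c->bytestream < c->bytestream_end) {
            /* safe reader: the cmp/jge pair skips the increment once the
             * pointer has reached bytestream_end. */
            c->bytestream += 2;
        }
    }
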
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index b5406ef16d..ecb9b231f1 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -5,20 +5,20 @@
* MMX-optimized DSP functions, based on H.264 optimizations by
* Michael Niedermayer and Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -139,7 +139,7 @@ static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
int i;
- DECLARE_ALIGNED(8, int16_t, b2)[64];
+ LOCAL_ALIGNED(16, int16_t, b2, [64]);
for(i=0; i<2; i++){
cavs_idct8_1d(block + 4 * i, ff_pw_4.a);
@@ -196,7 +196,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
);
}
- ff_add_pixels_clamped_mmx(b2, dst, stride);
+ ff_add_pixels_clamped(b2, dst, stride);
}
#endif /* HAVE_MMX_INLINE */
@@ -210,10 +210,10 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
****************************************************************************/
/* vertical filter [-1 -2 96 42 -7 0] */
-#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
- "pmullw %5, %%mm6 \n\t"\
+ "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm7\n\t"\
"psllw $3, "#E" \n\t"\
@@ -228,35 +228,35 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
"psubw "#B", %%mm6 \n\t"\
"psraw $1, "#B" \n\t"\
"psubw "#A", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
+ "paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -1 5 5 -1 0] */
-#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"paddw "#D", %%mm6 \n\t"\
- "pmullw %5, %%mm6 \n\t"\
+ "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
+ "paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -7 42 96 -2 -1] */
-#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
- "pmullw %5, %%mm7 \n\t"\
+ "pmullw "MANGLE(MUL1)", %%mm7\n\t"\
"psllw $3, "#B" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $3, "#B" \n\t"\
@@ -269,7 +269,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
"psubw "#E", %%mm6 \n\t"\
"psraw $1, "#E" \n\t"\
"psubw "#F", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
+ "paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
@@ -298,32 +298,34 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
- VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
- VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
\
: "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
+ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+ NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
: "memory"\
);\
if(h==16){\
__asm__ volatile(\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
- VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
- VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
\
: "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
+ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+ NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
: "memory"\
);\
}\
@@ -337,7 +339,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptr
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
- "movq %5, %%mm6 \n\t"\
+ "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
@@ -363,7 +365,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptr
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
- "movq %6, %%mm5 \n\t"\
+ "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm5, %%mm1 \n\t"\
"psraw $3, %%mm0 \n\t"\
@@ -375,7 +377,8 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptr
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
+ NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
: "memory"\
);\
}\
@@ -387,7 +390,7 @@ static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
- QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \
+ QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
@@ -468,7 +471,7 @@ static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_EXTERNAL
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
@@ -481,6 +484,12 @@ static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
ff_avg_pixels8_mmx(dst, src, stride, 8);
}
+static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ ff_avg_pixels8_mmxext(dst, src, stride, 8);
+}
+
static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
@@ -493,18 +502,40 @@ static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
ff_avg_pixels16_mmx(dst, src, stride, 16);
}
+static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ ff_avg_pixels16_mmxext(dst, src, stride, 16);
+}
+
+static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ ff_put_pixels16_sse2(dst, src, stride, 16);
+}
+
+static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+#endif
+
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
AVCodecContext *avctx)
{
+#if HAVE_MMX_EXTERNAL
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
+#endif
+#if HAVE_MMX_INLINE
c->cavs_idct8_add = cavs_idct8_add_mmx;
c->idct_perm = FF_IDCT_PERM_TRANSPOSE;
-}
#endif /* HAVE_MMX_INLINE */
+}
#define DSPFUNC(PFX, IDX, NUM, EXT) \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
@@ -520,15 +551,6 @@ CAVS_MC(put_, 8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_, 8, mmxext)
CAVS_MC(avg_, 16, mmxext)
-
-static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c,
- AVCodecContext *avctx)
-{
- DSPFUNC(put, 0, 16, mmxext);
- DSPFUNC(put, 1, 8, mmxext);
- DSPFUNC(avg, 0, 16, mmxext);
- DSPFUNC(avg, 1, 8, mmxext);
-}
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_AMD3DNOW_INLINE
@@ -552,18 +574,33 @@ static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
-#if HAVE_MMX_INLINE
- int cpu_flags = av_get_cpu_flags();
+ av_unused int cpu_flags = av_get_cpu_flags();
- if (INLINE_MMX(cpu_flags))
+ if (X86_MMX(cpu_flags))
cavsdsp_init_mmx(c, avctx);
-#endif /* HAVE_MMX_INLINE */
+
#if HAVE_AMD3DNOW_INLINE
if (INLINE_AMD3DNOW(cpu_flags))
cavsdsp_init_3dnow(c, avctx);
#endif /* HAVE_AMD3DNOW_INLINE */
#if HAVE_MMXEXT_INLINE
- if (INLINE_MMXEXT(cpu_flags))
- cavsdsp_init_mmxext(c, avctx);
-#endif /* HAVE_MMXEXT_INLINE */
+ if (INLINE_MMXEXT(cpu_flags)) {
+ DSPFUNC(put, 0, 16, mmxext);
+ DSPFUNC(put, 1, 8, mmxext);
+ DSPFUNC(avg, 0, 16, mmxext);
+ DSPFUNC(avg, 1, 8, mmxext);
+ }
+#endif
+#if HAVE_MMX_EXTERNAL
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
+ c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
+ }
+#endif
+#if HAVE_SSE2_EXTERNAL
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
+ c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
+ }
+#endif
}
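
The QPEL_CAVSV1/V2/V3 macros changed above evaluate the three 6-tap vertical filters named in their comments, with the rounding constant passed in as ADD (the +64/>>7 pair for the 128-scaled filters is inferred from the tap sums; only the +4/>>3 pair of V2 is visible in this hunk). A scalar sketch of one output pixel, for orientation only:

    #include <stdint.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* A..F are six vertically adjacent source pixels. */
    static uint8_t cavs_v1(int A, int B, int C, int D, int E, int F)
    {   /* taps [-1 -2 96 42 -7 0], sum 128 -> +64, >>7 */
        return clip_u8((-A - 2*B + 96*C + 42*D - 7*E + 0*F + 64) >> 7);
    }

    static uint8_t cavs_v2(int A, int B, int C, int D, int E, int F)
    {   /* taps [ 0 -1  5  5 -1  0], sum 8 -> +4 (ff_pw_4), >>3 */
        return clip_u8((0*A - B + 5*C + 5*D - E + 0*F + 4) >> 3);
    }

    static uint8_t cavs_v3(int A, int B, int C, int D, int E, int F)
    {   /* taps [ 0 -7 42 96 -2 -1], mirror of V1 */
        return clip_u8((0*A - 7*B + 42*C + 96*D - 2*E - F + 64) >> 7);
    }
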
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 6f7dd7346e..11002ee61e 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -1,20 +1,20 @@
/*
- * MMX/SSE constants used across x86 dsp optimizations.
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -22,12 +22,13 @@
#include "libavutil/x86/asm.h" // for xmm_reg
#include "constants.h"
-DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL, 0x0001000100010001ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL,
+ 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
+ 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
@@ -35,22 +36,58 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
0x0100010001000100ULL, 0x0100010001000100ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL,
+ 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
+ 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
+ 0x0400040004000400ULL, 0x0400040004000400ULL};
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
+ 0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
+ 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
+ 0x1000100010001000ULL, 0x1000100010001000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
+ 0x2000200020002000ULL, 0x2000200020002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
+ 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL, 0x0000000000000000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = { 0x0202020202020202ULL, 0x0202020202020202ULL,
+ 0x0202020202020202ULL, 0x0202020202020202ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL,
+ 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
+ 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL };
+
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL,
+ 0x0000000100000001ULL, 0x0000000100000001ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL,
+ 0x0000001000000010ULL, 0x0000001000000010ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
+ 0x0000002000000020ULL, 0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
+ 0x0000200000002000ULL, 0x0000200000002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+ 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 59ff94725d..bbb0ef844a 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -1,20 +1,20 @@
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,30 +25,48 @@
#include "libavutil/x86/asm.h"
-extern const uint64_t ff_wtwo;
-
+extern const ymm_reg ff_pw_1;
+extern const ymm_reg ff_pw_2;
extern const xmm_reg ff_pw_3;
-extern const xmm_reg ff_pw_4;
+extern const ymm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_8;
+extern const xmm_reg ff_pw_9;
extern const uint64_t ff_pw_15;
extern const xmm_reg ff_pw_16;
extern const xmm_reg ff_pw_18;
-extern const uint64_t ff_pw_20;
+extern const xmm_reg ff_pw_20;
extern const xmm_reg ff_pw_32;
extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_53;
extern const xmm_reg ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
-extern const uint64_t ff_pw_255;
+extern const ymm_reg ff_pw_255;
extern const ymm_reg ff_pw_256;
-extern const xmm_reg ff_pw_512;
-extern const xmm_reg ff_pw_m1;
+extern const ymm_reg ff_pw_512;
+extern const ymm_reg ff_pw_1023;
+extern const ymm_reg ff_pw_1024;
+extern const ymm_reg ff_pw_2048;
+extern const ymm_reg ff_pw_4095;
+extern const ymm_reg ff_pw_4096;
+extern const ymm_reg ff_pw_8192;
+extern const ymm_reg ff_pw_m1;
-extern const xmm_reg ff_pb_1;
-extern const xmm_reg ff_pb_3;
-extern const xmm_reg ff_pb_F8;
+extern const ymm_reg ff_pb_0;
+extern const ymm_reg ff_pb_1;
+extern const ymm_reg ff_pb_2;
+extern const ymm_reg ff_pb_3;
+extern const xmm_reg ff_pb_80;
+extern const ymm_reg ff_pb_FE;
extern const uint64_t ff_pb_FC;
+extern const xmm_reg ff_ps_neg;
+
+extern const ymm_reg ff_pd_1;
+extern const ymm_reg ff_pd_16;
+extern const ymm_reg ff_pd_32;
+extern const ymm_reg ff_pd_8192;
+extern const ymm_reg ff_pd_65535;
+
#endif /* AVCODEC_X86_CONSTANTS_H */
diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h
deleted file mode 100644
index 11d45ae61c..0000000000
--- a/libavcodec/x86/dca.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_DCA_H
-#define AVCODEC_X86_DCA_H
-
-#include "config.h"
-
-#if ARCH_X86_64 && HAVE_SSE2_INLINE
-# include "libavutil/x86/asm.h"
-# include "libavutil/mem.h"
-#include "libavcodec/dcadsp.h"
-
-# define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
- float *dst, const int8_t *src, int scale)
-{
- DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000;
- __asm__ volatile (
- "cvtsi2ss %2, %%xmm0 \n\t"
- "mulss %3, %%xmm0 \n\t"
- "movq (%1), %%xmm1 \n\t"
- "punpcklbw %%xmm1, %%xmm1 \n\t"
- "movaps %%xmm1, %%xmm2 \n\t"
- "punpcklwd %%xmm1, %%xmm1 \n\t"
- "punpckhwd %%xmm2, %%xmm2 \n\t"
- "psrad $24, %%xmm1 \n\t"
- "psrad $24, %%xmm2 \n\t"
- "shufps $0, %%xmm0, %%xmm0 \n\t"
- "cvtdq2ps %%xmm1, %%xmm1 \n\t"
- "cvtdq2ps %%xmm2, %%xmm2 \n\t"
- "mulps %%xmm0, %%xmm1 \n\t"
- "mulps %%xmm0, %%xmm2 \n\t"
- "movaps %%xmm1, 0(%0) \n\t"
- "movaps %%xmm2, 16(%0) \n\t"
- :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16)
- XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2")
- );
-}
-
-#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */
-
-#endif /* AVCODEC_X86_DCA_H */
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index fa8d3cb66a..055361a765 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -1,336 +1,301 @@
;******************************************************************************
-;* SSE-optimized functions for the DCA decoder
-;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;* SIMD-optimized functions for the DCA decoder
+;* Copyright (C) 2016 James Almer
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-pf_inv16: times 4 dd 0x3D800000 ; 1/16
-
SECTION .text
-; %1=v0/v1 %2=in1 %3=in2
-%macro FIR_LOOP 2-3
-.loop%1:
-%define va m1
-%define vb m2
-%if %1
-%define OFFSET 0
-%else
-%define OFFSET NUM_COEF*count
-%endif
-; for v0, incrementing and for v1, decrementing
- mova va, [cf0q + OFFSET]
- mova vb, [cf0q + OFFSET + 4*NUM_COEF]
-%if %0 == 3
- mova m4, [cf0q + OFFSET + mmsize]
- mova m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
-%endif
- mulps va, %2
- mulps vb, %2
-%if %0 == 3
- mulps m4, %3
- mulps m0, %3
- addps va, m4
- addps vb, m0
-%endif
- ; va = va1 va2 va3 va4
- ; vb = vb1 vb2 vb3 vb4
-%if %1
- SWAP va, vb
-%endif
- mova m4, va
- unpcklps va, vb ; va3 vb3 va4 vb4
- unpckhps m4, vb ; va1 vb1 va2 vb2
- addps m4, va ; va1+3 vb1+3 va2+4 vb2+4
- movhlps vb, m4 ; va1+3 vb1+3
- addps vb, m4 ; va0..4 vb0..4
- movlps [outq + count], vb
-%if %1
- sub cf0q, 8*NUM_COEF
-%endif
- add count, 8
- jl .loop%1
-%endmacro
+%define sizeof_float 4
+%define FMA3_OFFSET (8 * cpuflag(fma3))
-; void dca_lfe_fir(float *out, float *in, float *coefs)
-%macro DCA_LFE_FIR 1
-cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
-%define IN1 m3
-%define IN2 m5
-%define count inq
-%define NUM_COEF 4*(2-%1)
-%define NUM_OUT 32*(%1+1)
+%macro LFE_FIR0_FLOAT 0
+cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
+ shr nblocksd, 1
+ sub lfeq, 7*sizeof_float
+ mov cnt1d, 32*sizeof_float
+ mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+ lea coeffq, [coeffq+cnt1q*8]
+ add samplesq, cnt1q
+ neg cnt1q
- movu IN1, [inq + 4 - 1*mmsize]
- shufps IN1, IN1, q0123
-%if %1 == 0
- movu IN2, [inq + 4 - 2*mmsize]
- shufps IN2, IN2, q0123
-%endif
-
- mov count, -4*NUM_OUT
- add cf0q, 4*NUM_COEF*NUM_OUT
- add outq, 4*NUM_OUT
- ; compute v0 first
-%if %1 == 0
- FIR_LOOP 0, IN1, IN2
-%else
- FIR_LOOP 0, IN1
-%endif
- shufps IN1, IN1, q0123
- mov count, -4*NUM_OUT
- ; cf1 already correctly positioned
- add outq, 4*NUM_OUT ; outq now at out2
- sub cf0q, 8*NUM_COEF
-%if %1 == 0
- shufps IN2, IN2, q0123
- FIR_LOOP 1, IN2, IN1
+.loop:
+%if cpuflag(avx)
+ cvtdq2ps m4, [lfeq+16]
+ cvtdq2ps m5, [lfeq ]
+ shufps m7, m4, m4, q0123
+ shufps m6, m5, m5, q0123
+%elif cpuflag(sse2)
+ movu m4, [lfeq+16]
+ movu m5, [lfeq ]
+ cvtdq2ps m4, m4
+ cvtdq2ps m5, m5
+ pshufd m7, m4, q0123
+ pshufd m6, m5, q0123
%else
- FIR_LOOP 1, IN1
+ cvtpi2ps m4, [lfeq+16]
+ cvtpi2ps m0, [lfeq+24]
+ cvtpi2ps m5, [lfeq ]
+ cvtpi2ps m1, [lfeq+8 ]
+ shufps m4, m0, q1010
+ shufps m5, m1, q1010
+ shufps m7, m4, m4, q0123
+ shufps m6, m5, m5, q0123
%endif
- RET
-%endmacro
-INIT_XMM sse
-DCA_LFE_FIR 0
-DCA_LFE_FIR 1
+.inner_loop:
+%if ARCH_X86_64
+ movaps m8, [coeffq+cnt1q*8 ]
+ movaps m9, [coeffq+cnt1q*8+16]
+ movaps m10, [coeffq+cnt1q*8+32]
+ movaps m11, [coeffq+cnt1q*8+48]
+%if cpuflag(fma3)
+ movaps m12, [coeffq+cnt1q*8+64]
+ movaps m13, [coeffq+cnt1q*8+80]
+ movaps m14, [coeffq+cnt1q*8+96]
+ movaps m15, [coeffq+cnt1q*8+112]
+ mulps m0, m7, m8
+ mulps m1, m7, m10
+ mulps m2, m7, m12
+ mulps m3, m7, m14
+ fmaddps m0, m6, m9, m0
+ fmaddps m1, m6, m11, m1
+ fmaddps m2, m6, m13, m2
+ fmaddps m3, m6, m15, m3
-%macro SETZERO 1
-%if cpuflag(sse2) && notcpuflag(avx)
- pxor %1, %1
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
%else
- xorps %1, %1, %1
-%endif
-%endmacro
+ mulps m0, m7, m8
+ mulps m1, m6, m9
+ mulps m2, m7, m10
+ mulps m3, m6, m11
+ addps m0, m1
+ addps m2, m3
-%macro SHUF 3
-%if cpuflag(avx)
- mova %3, [%2 - 16]
- vperm2f128 %1, %3, %3, 1
- vshufps %1, %1, %1, q0123
-%elif cpuflag(sse2)
- pshufd %1, [%2], q0123
+ unpckhps m3, m0, m2
+ unpcklps m0, m2
+ addps m3, m0
+ movhlps m2, m3
+ addps m2, m3
+ movlps [samplesq+cnt1q], m2
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+ mulps m0, m7, [coeffq+cnt1q*8 ]
+ mulps m1, m7, [coeffq+cnt1q*8+32 ]
+ mulps m2, m7, [coeffq+cnt1q*8+64 ]
+ mulps m3, m7, [coeffq+cnt1q*8+96 ]
+ fmaddps m0, m6, [coeffq+cnt1q*8+16 ], m0
+ fmaddps m1, m6, [coeffq+cnt1q*8+48 ], m1
+ fmaddps m2, m6, [coeffq+cnt1q*8+80 ], m2
+ fmaddps m3, m6, [coeffq+cnt1q*8+112], m3
+
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
%else
- mova %1, [%2]
- shufps %1, %1, q0123
-%endif
-%endmacro
+ mulps m0, m7, [coeffq+cnt1q*8 ]
+ mulps m1, m6, [coeffq+cnt1q*8+16]
+ mulps m2, m7, [coeffq+cnt1q*8+32]
+ mulps m3, m6, [coeffq+cnt1q*8+48]
+ addps m0, m1
+ addps m2, m3
+
+ unpckhps m3, m0, m2
+ unpcklps m0, m2
+ addps m3, m0
+ movhlps m2, m3
+ addps m2, m3
+ movlps [samplesq+cnt1q], m2
+%endif
+%endif; ARCH
-%macro INNER_LOOP 1
- ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
- ;~ a += window[i + j] * (-synth_buf[15 - i + j])
- ;~ b += window[i + j + 16] * (synth_buf[i + j])
- SHUF m5, ptr2 + j + (15 - 3) * 4, m6
- mova m6, [ptr1 + j]
-%if ARCH_X86_64
- SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
- mova m12, [ptr1 + j + mmsize]
-%endif
-%if cpuflag(fma3)
- fmaddps m2, m6, [win + %1 + j + 16 * 4], m2
- fnmaddps m1, m5, [win + %1 + j], m1
-%if ARCH_X86_64
- fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
- fnmaddps m7, m11, [win + %1 + j + mmsize], m7
-%endif
-%else ; non-FMA
- mulps m6, m6, [win + %1 + j + 16 * 4]
- mulps m5, m5, [win + %1 + j]
-%if ARCH_X86_64
- mulps m12, m12, [win + %1 + j + mmsize + 16 * 4]
- mulps m11, m11, [win + %1 + j + mmsize]
-%endif
- addps m2, m2, m6
- subps m1, m1, m5
-%if ARCH_X86_64
- addps m8, m8, m12
- subps m7, m7, m11
-%endif
-%endif ; cpuflag(fma3)
- ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
- ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
- SHUF m6, ptr2 + j + (31 - 3) * 4, m5
- mova m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
- SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
- mova m11, [ptr1 + j + mmsize + 16 * 4]
-%endif
%if cpuflag(fma3)
- fmaddps m3, m5, [win + %1 + j + 32 * 4], m3
- fmaddps m4, m6, [win + %1 + j + 48 * 4], m4
-%if ARCH_X86_64
- fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
- fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
-%endif
-%else ; non-FMA
- mulps m5, m5, [win + %1 + j + 32 * 4]
- mulps m6, m6, [win + %1 + j + 48 * 4]
-%if ARCH_X86_64
- mulps m11, m11, [win + %1 + j + mmsize + 32 * 4]
- mulps m12, m12, [win + %1 + j + mmsize + 48 * 4]
-%endif
- addps m3, m3, m5
- addps m4, m4, m6
-%if ARCH_X86_64
- addps m9, m9, m11
- addps m10, m10, m12
-%endif
-%endif ; cpuflag(fma3)
- sub j, 64 * 4
-%endmacro
+ mulps m8, m5
+ mulps m10, m5
+ mulps m12, m5
+ mulps m14, m5
+ fmaddps m8, m4, m9, m8
+ fmaddps m10, m4, m11, m10
+ fmaddps m12, m4, m13, m12
+ fmaddps m14, m4, m15, m14
-; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
-; const float window[512], float out[32],
-; intptr_t offset, float scale)
-%macro SYNTH_FILTER 0
-cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
- synth_buf, synth_buf2, window, out, off, scale
-%define scale m0
-%if ARCH_X86_32 || WIN64
-%if cpuflag(sse2) && notcpuflag(avx)
- movd scale, scalem
- SPLATD m0
-%else
- VBROADCASTSS m0, scalem
-%endif
-; Make sure offset is in a register and not on the stack
-%define OFFQ r4q
+ haddps m10, m8
+ haddps m14, m12
+ haddps m14, m10
+ movaps [samplesq+cnt2q], m14
%else
- SPLATD xmm0
-%if cpuflag(avx)
- vinsertf128 m0, m0, xmm0, 1
-%endif
-%define OFFQ offq
-%endif
- ; prepare inner counter limit 1
- mov r5q, 480
- sub r5q, offmp
- and r5q, -64
- shl r5q, 2
-%if ARCH_X86_32 || notcpuflag(avx)
- mov OFFQ, r5q
-%define i r5q
- mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
+ mulps m8, m5
+ mulps m9, m4
+ mulps m10, m5
+ mulps m11, m4
+ addps m8, m9
+ addps m10, m11
+
+ unpckhps m11, m10, m8
+ unpcklps m10, m8
+ addps m11, m10
+ movhlps m8, m11
+ addps m8, m11
+ movlps [samplesq+cnt2q], m8
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+ mulps m0, m5, [coeffq+cnt1q*8 ]
+ mulps m1, m5, [coeffq+cnt1q*8+32 ]
+ mulps m2, m5, [coeffq+cnt1q*8+64 ]
+ mulps m3, m5, [coeffq+cnt1q*8+96 ]
+ fmaddps m0, m4, [coeffq+cnt1q*8+16 ], m0
+ fmaddps m1, m4, [coeffq+cnt1q*8+48 ], m1
+ fmaddps m2, m4, [coeffq+cnt1q*8+80 ], m2
+ fmaddps m3, m4, [coeffq+cnt1q*8+112], m3
+
+ haddps m1, m0
+ haddps m3, m2
+ haddps m3, m1
+ movaps [samplesq+cnt2q], m3
%else
-%define i 0
-%define OFFQ r5q
-%endif
+ mulps m0, m5, [coeffq+cnt1q*8 ]
+ mulps m1, m4, [coeffq+cnt1q*8+16]
+ mulps m2, m5, [coeffq+cnt1q*8+32]
+ mulps m3, m4, [coeffq+cnt1q*8+48]
+ addps m0, m1
+ addps m2, m3
+
+ unpckhps m3, m2, m0
+ unpcklps m2, m0
+ addps m3, m2
+ movhlps m0, m3
+ addps m0, m3
+ movlps [samplesq+cnt2q], m0
+%endif
+%endif; ARCH
+
+ sub cnt2d, 8 + FMA3_OFFSET
+ add cnt1q, 8 + FMA3_OFFSET
+ jl .inner_loop
+
+ add lfeq, 4
+ add samplesq, 64*sizeof_float
+ mov cnt1q, -32*sizeof_float
+ mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+ sub nblocksd, 1
+ jg .loop
+ RET
+%endmacro
-%define buf2 synth_buf2q
-%if ARCH_X86_32
- mov buf2, synth_buf2mp
-%endif
-.mainloop:
- ; m1 = a m2 = b m3 = c m4 = d
- SETZERO m3
- SETZERO m4
- mova m1, [buf2 + i]
- mova m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
-%define ptr1 r0q
-%define ptr2 r1q
-%define win r2q
-%define j r3q
- mov win, windowm
- mov ptr1, synth_bufm
-%if ARCH_X86_32 || notcpuflag(avx)
- add win, i
- add ptr1, i
+INIT_XMM sse
+LFE_FIR0_FLOAT
%endif
-%else ; ARCH_X86_64
-%define ptr1 r6q
-%define ptr2 r7q ; must be loaded
-%define win r8q
-%define j r9q
- SETZERO m9
- SETZERO m10
- mova m7, [buf2 + i + mmsize]
- mova m8, [buf2 + i + mmsize + 16 * 4]
- lea win, [windowq + i]
- lea ptr1, [synth_bufq + i]
+INIT_XMM sse2
+LFE_FIR0_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR0_FLOAT
%endif
- mov ptr2, synth_bufmp
- ; prepare the inner loop counter
- mov j, OFFQ
-%if ARCH_X86_32 || notcpuflag(avx)
- sub ptr2, i
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+LFE_FIR0_FLOAT
%endif
-.loop1:
- INNER_LOOP 0
- jge .loop1
- mov j, 448 * 4
- sub j, OFFQ
- jz .end
- sub ptr1, j
- sub ptr2, j
- add win, OFFQ ; now at j-64, so define OFFSET
- sub j, 64 * 4
-.loop2:
- INNER_LOOP 64 * 4
- jge .loop2
+%macro LFE_FIR1_FLOAT 0
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
+ shr nblocksd, 2
+ sub lfeq, 3*sizeof_float
+ mov cnt1d, 64*sizeof_float
+ mov cnt2d, 64*sizeof_float-16
+ lea coeffq, [coeffq+cnt1q*4]
+ add samplesq, cnt1q
+ neg cnt1q
-.end:
-%if ARCH_X86_32
- mov buf2, synth_buf2m ; needed for next iteration anyway
- mov outq, outmp ; j, which will be set again during it
-%endif
- ;~ out[i] = a * scale;
- ;~ out[i + 16] = b * scale;
- mulps m1, m1, scale
- mulps m2, m2, scale
-%if ARCH_X86_64
- mulps m7, m7, scale
- mulps m8, m8, scale
+.loop:
+%if cpuflag(avx)
+ cvtdq2ps m4, [lfeq]
+ shufps m5, m4, m4, q0123
+%elif cpuflag(sse2)
+ movu m4, [lfeq]
+ cvtdq2ps m4, m4
+ pshufd m5, m4, q0123
%endif
- ;~ synth_buf2[i] = c;
- ;~ synth_buf2[i + 16] = d;
- mova [buf2 + i + 0 * 4], m3
- mova [buf2 + i + 16 * 4], m4
+
+.inner_loop:
+ movaps m6, [coeffq+cnt1q*4 ]
+ movaps m7, [coeffq+cnt1q*4+16]
+ mulps m0, m5, m6
+ mulps m1, m5, m7
%if ARCH_X86_64
- mova [buf2 + i + 0 * 4 + mmsize], m9
- mova [buf2 + i + 16 * 4 + mmsize], m10
+ movaps m8, [coeffq+cnt1q*4+32]
+ movaps m9, [coeffq+cnt1q*4+48]
+ mulps m2, m5, m8
+ mulps m3, m5, m9
+%else
+ mulps m2, m5, [coeffq+cnt1q*4+32]
+ mulps m3, m5, [coeffq+cnt1q*4+48]
%endif
- ;~ out[i] = a;
- ;~ out[i + 16] = a;
- mova [outq + i + 0 * 4], m1
- mova [outq + i + 16 * 4], m2
+
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
+
+ mulps m6, m4
+ mulps m7, m4
%if ARCH_X86_64
- mova [outq + i + 0 * 4 + mmsize], m7
- mova [outq + i + 16 * 4 + mmsize], m8
-%endif
-%if ARCH_X86_32 || notcpuflag(avx)
- sub i, (ARCH_X86_64 + 1) * mmsize
- jge .mainloop
+ mulps m8, m4
+ mulps m9, m4
+
+ haddps m6, m7
+ haddps m8, m9
+ haddps m6, m8
+%else
+ mulps m2, m4, [coeffq+cnt1q*4+32]
+ mulps m3, m4, [coeffq+cnt1q*4+48]
+
+ haddps m6, m7
+ haddps m2, m3
+ haddps m6, m2
%endif
+ movaps [samplesq+cnt2q], m6
+
+ sub cnt2d, 16
+ add cnt1q, 16
+ jl .inner_loop
+
+ add lfeq, sizeof_float
+ add samplesq, 128*sizeof_float
+ mov cnt1q, -64*sizeof_float
+ mov cnt2d, 64*sizeof_float-16
+ sub nblocksd, 1
+ jg .loop
RET
%endmacro
-%if ARCH_X86_32
-INIT_XMM sse
-SYNTH_FILTER
+INIT_XMM sse3
+LFE_FIR1_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR1_FLOAT
%endif
-INIT_XMM sse2
-SYNTH_FILTER
-INIT_YMM avx
-SYNTH_FILTER
-INIT_YMM fma3
-SYNTH_FILTER
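
The rewritten LFE FIR loops above compute four dot products per iteration; on the haddps-capable paths (SSE3/AVX/FMA3) the reduction is three horizontal adds, while the plain SSE2 path reaches the same result with unpck/movhlps. An illustrative C intrinsics fragment of the haddps folding (not FFmpeg code, only the standard SSE3 intrinsic):

    #include <pmmintrin.h>   /* SSE3: _mm_hadd_ps */

    /* p0..p3 hold the element-wise products of four coefficient rows with
     * the replicated LFE input; the result packs their four sums. */
    static __m128 four_dot_products(__m128 p0, __m128 p1, __m128 p2, __m128 p3)
    {
        __m128 s01 = _mm_hadd_ps(p0, p1); /* { p0a+p0b, p0c+p0d, p1a+p1b, p1c+p1d } */
        __m128 s23 = _mm_hadd_ps(p2, p3);
        return _mm_hadd_ps(s01, s23);     /* { sum p0, sum p1, sum p2, sum p3 } */
    }
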
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 8632c4a98f..fc10fb8bc5 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -1,20 +1,18 @@
/*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+ * This file is part of FFmpeg.
*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,66 +21,32 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h"
-void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
-
-av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_SSE(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
- s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
- }
-}
-
+#define LFE_FIR_FLOAT_FUNC(opt) \
+void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
+ const float *filter_coeff, ptrdiff_t npcmblocks); \
+void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
+ const float *filter_coeff, ptrdiff_t npcmblocks);
-#define SYNTH_FILTER_FUNC(opt) \
-void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
- const float window[512], \
- float out[32], intptr_t offset, float scale); \
-static void synth_filter_##opt(FFTContext *imdct, \
- float *synth_buf_ptr, int *synth_buf_offset, \
- float synth_buf2[32], const float window[512], \
- float out[32], const float in[32], float scale) \
-{ \
- float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
- \
- imdct->imdct_half(imdct, synth_buf, in); \
- \
- ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
- out, *synth_buf_offset, scale); \
- \
- *synth_buf_offset = (*synth_buf_offset - 32) & 511; \
-} \
+LFE_FIR_FLOAT_FUNC(sse)
+LFE_FIR_FLOAT_FUNC(sse2)
+LFE_FIR_FLOAT_FUNC(sse3)
+LFE_FIR_FLOAT_FUNC(avx)
+LFE_FIR_FLOAT_FUNC(fma3)
-#if HAVE_YASM
-#if ARCH_X86_32
-SYNTH_FILTER_FUNC(sse)
-#endif
-SYNTH_FILTER_FUNC(sse2)
-SYNTH_FILTER_FUNC(avx)
-SYNTH_FILTER_FUNC(fma3)
-#endif /* HAVE_YASM */
-
-av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
+av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
-#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
-#if ARCH_X86_32
- if (EXTERNAL_SSE(cpu_flags)) {
- s->synth_filter_float = synth_filter_sse;
- }
-#endif
- if (EXTERNAL_SSE2(cpu_flags)) {
- s->synth_filter_float = synth_filter_sse2;
- }
- if (EXTERNAL_AVX_FAST(cpu_flags)) {
- s->synth_filter_float = synth_filter_avx;
- }
- if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
- s->synth_filter_float = synth_filter_fma3;
+ if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags))
+ s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
+ if (EXTERNAL_SSE2(cpu_flags))
+ s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
+ if (EXTERNAL_SSE3(cpu_flags))
+ s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
+ if (EXTERNAL_AVX(cpu_flags)) {
+ s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
+ s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
}
-#endif /* HAVE_YASM */
+ if (EXTERNAL_FMA3(cpu_flags))
+ s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
}
diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm
index 2c4c32eb11..4e657b5460 100644
--- a/libavcodec/x86/dct32.asm
+++ b/libavcodec/x86/dct32.asm
@@ -2,20 +2,20 @@
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -192,6 +192,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
INIT_YMM avx
SECTION .text
+%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal dct32_float, 2,3,8, out, in, tmp
; pass 1
@@ -264,6 +265,7 @@ cglobal dct32_float, 2,3,8, out, in, tmp
INIT_XMM
PASS6_AND_PERMUTE
RET
+%endif
%if ARCH_X86_64
%define SPILL SWAP
diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c
index b2e43a9be1..c31ef92238 100644
--- a/libavcodec/x86/dct_init.c
+++ b/libavcodec/x86/dct_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/dirac_dwt.asm b/libavcodec/x86/dirac_dwt.asm
new file mode 100644
index 0000000000..89806899a2
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt.asm
@@ -0,0 +1,307 @@
+;******************************************************************************
+;* x86 optimized discrete wavelet transform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1991: times 4 dw 9,-1
+
+cextern pw_1
+cextern pw_2
+cextern pw_8
+cextern pw_16
+
+section .text
+
+; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2
+%macro COMPOSE_53iL0 4
+ paddw %2, %3
+ paddw %2, %4
+ psraw %2, 2
+ psubw %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered m3: pw_8 m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+ paddw m0, %3
+ paddw m1, %2
+ psubw m0, m3
+ mova m2, m1
+ punpcklwd m1, m0
+ punpckhwd m2, m0
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+%if %0 > 3
+ movu %1, %4
+%endif
+ psrad m1, 4
+ psrad m2, 4
+ packssdw m1, m2
+ paddw m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+ mova m2, [pw_2]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m1, [b0q+2*widthq]
+ mova m0, [b1q+2*widthq]
+ COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+ mova [b1q+2*widthq], m0
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+ mova m1, [pw_1]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ paddw m0, [b2q+2*widthq]
+ paddw m0, m1
+ psraw m0, 1
+ paddw m0, [b1q+2*widthq]
+ mova [b1q+2*widthq], m0
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+ mova m3, [pw_8]
+ mova m4, [pw_1991]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ mova m1, [b1q+2*widthq]
+ COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+ mova [b2q+2*widthq], m1
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+ mova m3, [pw_16]
+ mova m4, [pw_1991]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ mova m1, [b1q+2*widthq]
+ mova m5, [b2q+2*widthq]
+ paddw m0, [b4q+2*widthq]
+ paddw m1, [b3q+2*widthq]
+ psubw m0, m3
+ mova m2, m1
+ punpcklwd m1, m0
+ punpckhwd m2, m0
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ psrad m1, 5
+ psrad m2, 5
+ packssdw m1, m2
+ psubw m5, m1
+ mova [b2q+2*widthq], m5
+ jg .loop
+ REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+ mova m3, [pw_1]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m1, [b1q+2*widthq]
+ mova m0, [b0q+2*widthq]
+ mova m2, m1
+ paddw m1, m3
+ psraw m1, 1
+ psubw m0, m1
+ mova [b0q+2*widthq], m0
+ paddw m2, m0
+ mova [b1q+2*widthq], m2
+ jg .loop
+ REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+ mov %3, [tmpq]
+%assign %%i 1
+%rep %1
+ mov [tmpq-2*%%i], %3
+ %assign %%i %%i+1
+%endrep
+ mov %3, [tmpq+2*w2q-2]
+%assign %%i 0
+%rep %2
+ mov [tmpq+2*w2q+2*%%i], %3
+ %assign %%i %%i+1
+%endrep
+%endmacro
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+ mov w2d, wd
+ xor xq, xq
+ shr w2d, 1
+ lea b_w2q, [bq+wq]
+ mova m3, [pw_1]
+.lowpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [bq + 2*xq]
+ paddw m1, m3
+ psraw m1, 1
+ psubw m0, m1
+ mova [tmpq + 2*xq], m0
+ add xq, mmsize/2
+ cmp xq, w2q
+ jl .lowpass_loop
+
+ xor xq, xq
+ and w2q, ~(mmsize/2 - 1)
+ cmp w2q, mmsize/2
+ jl .end
+
+.highpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [tmpq + 2*xq]
+ paddw m1, m0
+
+ ; shift and interleave
+%if %2 == 1
+ paddw m0, m3
+ paddw m1, m3
+ psraw m0, 1
+ psraw m1, 1
+%endif
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ mova [bq+4*xq], m0
+ mova [bq+4*xq+mmsize], m2
+
+ add xq, mmsize/2
+ cmp xq, w2q
+ jl .highpass_loop
+.end:
+ REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+ mov w2d, wd
+ xor xd, xd
+ shr w2d, 1
+ lea b_w2q, [bq+wq]
+ movu m4, [bq+wq]
+ mova m7, [pw_2]
+ pslldq m4, 14
+.lowpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [bq + 2*xq]
+ mova m2, m1
+ palignr m1, m4, 14
+ mova m4, m2
+ COMPOSE_53iL0 m0, m1, m2, m7
+ mova [tmpq + 2*xq], m0
+ add xd, mmsize/2
+ cmp xd, w2d
+ jl .lowpass_loop
+
+ EDGE_EXTENSION 1, 2, xw
+ ; leave the last up to 7 (sse) or 3 (mmx) values for C
+ xor xd, xd
+ and w2d, ~(mmsize/2 - 1)
+ cmp w2d, mmsize/2
+ jl .end
+
+ mova m7, [tmpq-mmsize]
+ mova m0, [tmpq]
+ mova m5, [pw_1]
+ mova m3, [pw_8]
+ mova m4, [pw_1991]
+.highpass_loop:
+ mova m6, m0
+ palignr m0, m7, 14
+ mova m7, [tmpq + 2*xq + 16]
+ mova m1, m7
+ mova m2, m7
+ palignr m1, m6, 2
+ palignr m2, m6, 4
+ COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
+ mova m0, m7
+ mova m7, m6
+
+ ; shift and interleave
+ paddw m6, m5
+ paddw m1, m5
+ psraw m6, 1
+ psraw m1, 1
+ mova m2, m6
+ punpcklwd m6, m1
+ punpckhwd m2, m1
+ mova [bq+4*xq], m6
+ mova [bq+4*xq+mmsize], m2
+
+ add xd, mmsize/2
+ cmp xd, w2d
+ jl .highpass_loop
+.end:
+ REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx
+HAAR_HORIZONTAL mmx, 0
+HAAR_HORIZONTAL mmx, 1
+%endif
+
+;;INIT_XMM
+INIT_XMM
+COMPOSE_VERTICAL sse2
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
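
The vertical compose macros above implement the inverse lifting steps spelled out in their comments; the scalar definitions that the wrappers in dirac_dwt_init.c fall back to live in dirac_dwt.h. A C restatement of the three update rules, written from the comments (function names here are illustrative):

    #include <stdint.h>

    /* COMPOSE_53iL0: %1 -= (%2 + %3 + 2) >> 2 */
    static int16_t compose_53iL0(int16_t b0, int16_t b1, int16_t b2)
    {
        return b1 - ((b0 + b2 + 2) >> 2);
    }

    /* vertical_compose_dirac53iH0: b1 += (b0 + b2 + 1) >> 1 */
    static int16_t compose_dirac53iH0(int16_t b0, int16_t b1, int16_t b2)
    {
        return b1 + ((b0 + b2 + 1) >> 1);
    }

    /* COMPOSE_DD97iH0: b2 += (-b0 + 9*b1 + 9*b3 - b4 + 8) >> 4,
     * the 9/-1 pairs being the pw_1991 pmaddwd constant. */
    static int16_t compose_dd97iH0(int16_t b0, int16_t b1, int16_t b2,
                                   int16_t b3, int16_t b4)
    {
        return b2 + ((-b0 + 9*b1 + 9*b3 - b4 + 8) >> 4);
    }
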
diff --git a/libavcodec/x86/dirac_dwt_init.c b/libavcodec/x86/dirac_dwt_init.c
new file mode 100644
index 0000000000..afdf0a1415
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_init.c
@@ -0,0 +1,229 @@
+/*
+ * x86 optimized discrete wavelet transform
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dirac_dwt.h"
+
+#define COMPOSE_VERTICAL(ext, align) \
+void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
+void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
+void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
+void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
+void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\
+\
+static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+ int16_t *b0 = (int16_t *)_b0; \
+ int16_t *b1 = (int16_t *)_b1; \
+ int16_t *b2 = (int16_t *)_b2; \
+\
+ for(i=width_align; i<width; i++) \
+ b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
+\
+ ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+ int16_t *b0 = (int16_t *)_b0; \
+ int16_t *b1 = (int16_t *)_b1; \
+ int16_t *b2 = (int16_t *)_b2; \
+\
+ for(i=width_align; i<width; i++) \
+ b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
+\
+ ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
+ uint8_t *_b3, uint8_t *_b4, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+ int16_t *b0 = (int16_t *)_b0; \
+ int16_t *b1 = (int16_t *)_b1; \
+ int16_t *b2 = (int16_t *)_b2; \
+ int16_t *b3 = (int16_t *)_b3; \
+ int16_t *b4 = (int16_t *)_b4; \
+\
+ for(i=width_align; i<width; i++) \
+ b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+ ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+\
+static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
+ uint8_t *_b3, uint8_t *_b4, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+ int16_t *b0 = (int16_t *)_b0; \
+ int16_t *b1 = (int16_t *)_b1; \
+ int16_t *b2 = (int16_t *)_b2; \
+ int16_t *b3 = (int16_t *)_b3; \
+ int16_t *b4 = (int16_t *)_b4; \
+\
+ for(i=width_align; i<width; i++) \
+ b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+ ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+ int16_t *b0 = (int16_t *)_b0; \
+ int16_t *b1 = (int16_t *)_b1; \
+\
+ for(i=width_align; i<width; i++) { \
+ b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+ b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
+ } \
+\
+ ff_vertical_compose_haar##ext(b0, b1, width_align); \
+} \
+static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
+{\
+ int w2= w>>1;\
+ int x= w2 - (w2&(align-1));\
+ int16_t *b = (int16_t *)_b; \
+ int16_t *tmp = (int16_t *)_tmp; \
+\
+ ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+\
+ for (; x < w2; x++) {\
+ b[2*x ] = tmp[x];\
+ b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+ }\
+}\
+static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
+{\
+ int w2= w>>1;\
+ int x= w2 - (w2&(align-1));\
+ int16_t *b = (int16_t *)_b; \
+ int16_t *tmp = (int16_t *)_tmp; \
+\
+ ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+\
+ for (; x < w2; x++) {\
+ b[2*x ] = (tmp[x] + 1)>>1;\
+ b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+ }\
+}\
+\
+
+#if HAVE_YASM
+#if !ARCH_X86_64
+COMPOSE_VERTICAL(_mmx, 4)
+#endif
+COMPOSE_VERTICAL(_sse2, 8)
+
+
+void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);
+
+static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
+{
+ int w2= w>>1;
+ int x= w2 - (w2&7);
+ int16_t *b = (int16_t *)_b;
+ int16_t *tmp = (int16_t *)_tmp;
+
+ ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
+
+ for (; x < w2; x++) {
+ b[2*x ] = (tmp[x] + 1)>>1;
+ b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+ }
+}
+#endif
+
+void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_YASM
+ int mm_flags = av_get_cpu_flags();
+
+#if !ARCH_X86_64
+ if (!(mm_flags & AV_CPU_FLAG_MMX))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+ break;
+ case DWT_DIRAC_LEGALL5_3:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
+ break;
+ case DWT_DIRAC_DD13_7:
+ d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+ break;
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_mmx;
+ d->horizontal_compose = horizontal_compose_haar0i_mmx;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->vertical_compose = (void*)vertical_compose_haar_mmx;
+ d->horizontal_compose = horizontal_compose_haar1i_mmx;
+ break;
+ }
+#endif
+
+ if (!(mm_flags & AV_CPU_FLAG_SSE2))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+ break;
+ case DWT_DIRAC_LEGALL5_3:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
+ break;
+ case DWT_DIRAC_DD13_7:
+ d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+ break;
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_sse2;
+ d->horizontal_compose = horizontal_compose_haar0i_sse2;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->vertical_compose = (void*)vertical_compose_haar_sse2;
+ d->horizontal_compose = horizontal_compose_haar1i_sse2;
+ break;
+ }
+
+ if (!(mm_flags & AV_CPU_FLAG_SSSE3))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->horizontal_compose = horizontal_compose_dd97i_ssse3;
+ break;
+ }
+#endif // HAVE_YASM
+}
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
new file mode 100644
index 0000000000..6b3f780e41
--- /dev/null
+++ b/libavcodec/x86/diracdsp.asm
@@ -0,0 +1,347 @@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200
+clip_10bit: times 8 dw 0x3ff
+
+cextern pw_3
+cextern pw_16
+cextern pw_32
+cextern pb_80
+
+section .text
+
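+; Unpack two 8-bit sources to 16 bits and add them: %1 receives the sums of
+; the low halves, %2 the sums of the high halves. %5/%6 select aligned or
+; unaligned loads; m7 must be zero.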
+%macro UNPACK_ADD 6
+ mov%5 %1, %3
+ mov%6 m5, %4
+ mova m4, %1
+ mova %2, m5
+ punpcklbw %1, m7
+ punpcklbw m5, m7
+ punpckhbw m4, m7
+ punpckhbw %2, m7
+ paddw %1, m5
+ paddw %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_%1(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+ mov src0q, srcq
+ lea stridex3q, [3*strideq]
+ sub src0q, stridex3q
+ pxor m7, m7
+.loop:
+ ; 7*(src[0] + src[1])
+ UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+ pmullw m0, [pw_7]
+ pmullw m1, [pw_7]
+
+ ; 3*( ... + src[-2] + src[3])
+ UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+ paddw m0, m2
+ paddw m1, m3
+ pmullw m0, [pw_3]
+ pmullw m1, [pw_3]
+
+ ; ... - 7*(src[-1] + src[2])
+ UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+ pmullw m2, [pw_7]
+ pmullw m3, [pw_7]
+ psubw m0, m2
+ psubw m1, m3
+
+ ; ... - (src[-3] + src[4])
+ UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+ psubw m0, m2
+ psubw m1, m3
+
+ paddw m0, [pw_16]
+ paddw m1, [pw_16]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, mmsize
+ add srcq, mmsize
+ add src0q, mmsize
+ sub widthd, mmsize
+ jg .loop
+ RET
+
+; dirac_hpel_filter_h_%1(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+ dec widthd
+ pxor m7, m7
+ and widthd, ~(mmsize-1)
+.loop:
+ ; 7*(src[0] + src[1])
+ UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+ pmullw m0, [pw_7]
+ pmullw m1, [pw_7]
+
+ ; 3*( ... + src[-2] + src[3])
+ UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+ paddw m0, m2
+ paddw m1, m3
+ pmullw m0, [pw_3]
+ pmullw m1, [pw_3]
+
+ ; ... - 7*(src[-1] + src[2])
+ UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+ pmullw m2, [pw_7]
+ pmullw m3, [pw_7]
+ psubw m0, m2
+ psubw m1, m3
+
+ ; ... - (src[-3] + src[4])
+ UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+ psubw m0, m2
+ psubw m1, m3
+
+ paddw m0, [pw_16]
+ paddw m1, [pw_16]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ mova [dstq + widthq], m0
+ sub widthd, mmsize
+ jge .loop
+ RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
+ mova m0, [pb_80]
+ add wd, (mmsize-1)
+ and wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+ movsxd dst_strideq, dst_strided
+ movsxd src_strideq, src_strided
+ mov r7d, r5m
+ mov r8d, wd
+ %define wspill r8d
+ %define hd r7d
+%else
+ mov r4m, wd
+ %define wspill r4m
+ %define hd r5mp
+%endif
+
+.loopy:
+ lea src2q, [srcq+src_strideq]
+ lea dst2q, [dstq+dst_strideq]
+.loopx:
+ sub wd, mmsize
+ mova m1, [srcq +2*wq]
+ mova m2, [src2q+2*wq]
+ packsswb m1, [srcq +2*wq+mmsize]
+ packsswb m2, [src2q+2*wq+mmsize]
+ paddb m1, m0
+ paddb m2, m0
+ mova [dstq +wq], m1
+ mova [dst2q+wq], m2
+ jg .loopx
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+ sub hd, 2
+ mov wd, wspill
+ jg .loopy
+ RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
+ mova m0, [pw_32]
+ add wd, (mmsize-1)
+ and wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+ movsxd strideq, strided
+ movsxd idwt_strideq, idwt_strided
+ mov r8d, wd
+ %define wspill r8d
+%else
+ mov r5m, wd
+ %define wspill r5m
+%endif
+
+.loop:
+ sub wd, mmsize
+ movu m1, [srcq +2*wq] ; FIXME: ensure alignment
+ paddw m1, m0
+ psraw m1, 6
+ movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+ paddw m2, m0
+ psraw m2, 6
+ paddw m1, [idwtq+2*wq]
+ paddw m2, [idwtq+2*wq+mmsize]
+ packuswb m1, m2
+ mova [dstq +wq], m1
+ jg .loop
+
+ lea srcq, [srcq + 2*strideq]
+ add dstq, strideq
+ lea idwtq, [idwtq+ 2*idwt_strideq]
+ sub hd, 1
+ mov wd, wspill
+ jg .loop
+ RET
+%endm
+
+%macro ADD_OBMC 2
+; void add_dirac_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+ pxor m4, m4
+.loop:
+%assign i 0
+%rep %1 / mmsize
+ mova m0, [srcq+i]
+ mova m1, m0
+ punpcklbw m0, m4
+ punpckhbw m1, m4
+ mova m2, [obmcq+i]
+ mova m3, m2
+ punpcklbw m2, m4
+ punpckhbw m3, m4
+ pmullw m0, m2
+ pmullw m1, m3
+ movu m2, [dstq+2*i]
+ movu m3, [dstq+2*i+mmsize]
+ paddw m0, m2
+ paddw m1, m3
+ movu [dstq+2*i], m0
+ movu [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+ lea srcq, [srcq+strideq]
+ lea dstq, [dstq+2*strideq]
+ add obmcq, 32
+ sub yblend, 1
+ jg .loop
+ RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
+
+INIT_XMM sse4
+
+; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
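+; dst[i] = sign(src[i]) * ((abs(src[i]) * qf + qs) >> 2)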
+cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
+ movd m2, qfd
+ movd m3, qsd
+ SPLATD m2
+ SPLATD m3
+ mov r4, tot_hq
+ mov r3, dstq
+
+ .loop_v:
+ mov tot_hq, r4
+ mov dstq, r3
+
+ .loop_h:
+ movu m0, [srcq]
+
+ pabsd m1, m0
+ pmulld m1, m2
+ paddd m1, m3
+ psrld m1, 2
+ psignd m1, m0
+
+ movu [dstq], m1
+
+ add srcq, mmsize
+ add dstq, mmsize
+ sub tot_hd, 4
+ jg .loop_h
+
+ add r3, strideq
+ dec tot_vd
+ jg .loop_v
+
+ RET
+
+INIT_XMM sse4
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+%if ARCH_X86_64
+cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
+%else
+cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
+ %define hd r5mp
+%endif
+ shl wd, 2
+ add srcq, wq
+ neg wq
+ mov t2q, dstq
+ mov t1q, wq
+ pxor m2, m2
+ mova m3, [clip_10bit]
+ mova m4, [convert_to_unsigned_10bit]
+
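+ ; bias the signed 10-bit samples by 512, then clamp the result to [0, 1023]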
+ .loop_h:
+ mov dstq, t2q
+ mov wq, t1q
+
+ .loop_w:
+ movu m0, [srcq+wq+0*mmsize]
+ movu m1, [srcq+wq+1*mmsize]
+
+ paddd m0, m4
+ paddd m1, m4
+ packusdw m0, m0, m1
+ CLIPW m0, m2, m3 ; packusdw saturates so it's fine
+
+ movu [dstq], m0
+
+ add dstq, 1*mmsize
+ add wq, 2*mmsize
+ jl .loop_w
+
+ add srcq, src_strideq
+ add t2q, dst_strideq
+ sub hd, 1
+ jg .loop_h
+
+ RET
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
new file mode 100644
index 0000000000..b195113789
--- /dev/null
+++ b/libavcodec/x86/diracdsp_init.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/diracdsp.h"
+#include "fpel.h"
+
+DECL_DIRAC_PIXOP(put, mmx);
+DECL_DIRAC_PIXOP(avg, mmx);
+DECL_DIRAC_PIXOP(avg, mmxext);
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+
+void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+
+void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
+
+#if HAVE_YASM
+
+#define HPEL_FILTER(MMSIZE, EXT) \
+ void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
+ void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
+ \
+ static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
+ const uint8_t *src, int stride, int width, int height) \
+ { \
+ while( height-- ) \
+ { \
+ ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
+ ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
+ ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
+ \
+ dsth += stride; \
+ dstv += stride; \
+ dstc += stride; \
+ src += stride; \
+ } \
+ }
+
+#define PIXFUNC(PFX, IDX, EXT) \
+ /* MMX DISABLED: c->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT; */ \
+ c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
+ c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
+
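+/* The SIMD pixel ops process four rows per iteration, so heights that are
+ * not a multiple of 4 fall back to the C versions. */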
+#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
+void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3)\
+ ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
+ else\
+ OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3)\
+ ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
+ else\
+ OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3) {\
+ ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
+ } else {\
+ OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
+ OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+ }\
+}
+
+DIRAC_PIXOP(put, ff_put, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmxext)
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3)
+ ff_put_dirac_pixels16_c(dst, src, stride, h);
+ else
+ ff_put_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3)
+ ff_avg_dirac_pixels16_c(dst, src, stride, h);
+ else
+ ff_avg_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3) {
+ ff_put_dirac_pixels32_c(dst, src, stride, h);
+ } else {
+ ff_put_pixels16_sse2(dst , src[0] , stride, h);
+ ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+ }
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3) {
+ ff_avg_dirac_pixels32_c(dst, src, stride, h);
+ } else {
+ ff_avg_pixels16_sse2(dst , src[0] , stride, h);
+ ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+ }
+}
+
+#else // HAVE_YASM
+
+#define HPEL_FILTER(MMSIZE, EXT) \
+ void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
+ const uint8_t *src, int stride, int width, int height);
+
+#define PIXFUNC(PFX, IDX, EXT) do {} while (0)
+
+#endif // HAVE_YASM
+
+#if !ARCH_X86_64
+HPEL_FILTER(8, mmx)
+#endif
+HPEL_FILTER(16, sse2)
+
+void ff_diracdsp_init_x86(DiracDSPContext* c)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(mm_flags)) {
+ c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
+#if !ARCH_X86_64
+ c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
+ c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
+ c->dirac_hpel_filter = dirac_hpel_filter_mmx;
+ c->add_rect_clamped = ff_add_rect_clamped_mmx;
+ c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_mmx;
+#endif
+ PIXFUNC(put, 0, mmx);
+ PIXFUNC(avg, 0, mmx);
+ }
+
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+ PIXFUNC(avg, 0, mmxext);
+ }
+
+ if (EXTERNAL_SSE2(mm_flags)) {
+ c->dirac_hpel_filter = dirac_hpel_filter_sse2;
+ c->add_rect_clamped = ff_add_rect_clamped_sse2;
+ c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
+
+ c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
+ c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
+
+ c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
+ c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
+ c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
+ c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
+ }
+
+ if (EXTERNAL_SSE4(mm_flags)) {
+ c->dequant_subband[1] = ff_dequant_subband_32_sse4;
+ c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
+ }
+}
diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm
index d39b07b9f4..9dd6d51ee6 100644
--- a/libavcodec/x86/dnxhdenc.asm
+++ b/libavcodec/x86/dnxhdenc.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
;* Copyright (c) 2014 Tiancheng "Timothy" Gu <timothygu99@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/dnxhdenc_init.c b/libavcodec/x86/dnxhdenc_init.c
index f1ff7bd986..fd6f15005a 100644
--- a/libavcodec/x86/dnxhdenc_init.c
+++ b/libavcodec/x86/dnxhdenc_init.c
@@ -4,20 +4,20 @@
*
* VC-3 encoder funded by the British Broadcasting Corporation
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/fdct.c b/libavcodec/x86/fdct.c
index 6528b57361..112566ded0 100644
--- a/libavcodec/x86/fdct.c
+++ b/libavcodec/x86/fdct.c
@@ -13,20 +13,20 @@
* a page about fdct at http://www.geocities.com/ssavekar/dct.htm
* Skal's fdct at http://skal.planet-d.net/coding/dct.html
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -70,7 +70,7 @@ DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
-static struct
+static const struct
{
DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
@@ -153,7 +153,7 @@ DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct
29692, -12299, 26722, -31521,
};
-static struct
+static const struct
{
DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
diff --git a/libavcodec/x86/fdct.h b/libavcodec/x86/fdct.h
index c94a977e8f..648cdc5350 100644
--- a/libavcodec/x86/fdct.h
+++ b/libavcodec/x86/fdct.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/fdctdsp_init.c b/libavcodec/x86/fdctdsp_init.c
index 4e8e4eb60d..0cb5fd625b 100644
--- a/libavcodec/x86/fdctdsp_init.c
+++ b/libavcodec/x86/fdctdsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index ef007f4eec..cdbfd66e82 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -6,20 +6,20 @@
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -51,13 +51,12 @@ struc FFTContext
.imdcthalf:pointer 1
endstruc
-SECTION_RODATA
+SECTION_RODATA 32
%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509
-align 32
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
@@ -69,11 +68,12 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
-ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0
+cextern ps_neg
+
%assign i 16
-%rep 13
+%rep 14
cextern cos_ %+ i
%assign i i<<1
%endrep
@@ -305,6 +305,7 @@ IF%1 mova Z(1), m5
INIT_YMM avx
+%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
mova m0, Z(0)
@@ -394,6 +395,8 @@ fft32_interleave_avx:
jg .deint_loop
ret
+%endif
+
INIT_XMM sse
align 16
@@ -537,6 +540,7 @@ DEFINE_ARGS zc, w, n, o1, o3
INIT_YMM avx
+%if HAVE_AVX_EXTERNAL
%macro INTERL_AVX 5
vunpckhps %3, %2, %1
vunpcklps %2, %2, %1
@@ -558,6 +562,7 @@ cglobal fft_calc, 2,5,8
FFT_DISPATCH _interleave %+ SUFFIX, r1
REP_RET
+%endif
INIT_XMM sse
@@ -655,6 +660,68 @@ cglobal fft_permute, 2,7,1
jl .loopcopy
REP_RET
+%macro IMDCT_CALC_FUNC 0
+cglobal imdct_calc, 3,5,3
+ mov r3d, [r0 + FFTContext.mdctsize]
+ mov r4, [r0 + FFTContext.imdcthalf]
+ add r1, r3
+ PUSH r3
+ PUSH r1
+%if ARCH_X86_32
+ push r2
+ push r1
+ push r0
+%else
+ sub rsp, 8+32*WIN64 ; allocate win64 shadow space
+%endif
+ call r4
+%if ARCH_X86_32
+ add esp, 12
+%else
+ add rsp, 8+32*WIN64
+%endif
+ POP r1
+ POP r3
+ lea r0, [r1 + 2*r3]
+ mov r2, r3
+ sub r3, mmsize
+ neg r2
+ mova m2, [ps_neg]
+.loop:
+%if mmsize == 8
+ PSWAPD m0, [r1 + r3]
+ PSWAPD m1, [r0 + r2]
+ pxor m0, m2
+%else
+ mova m0, [r1 + r3]
+ mova m1, [r0 + r2]
+ shufps m0, m0, 0x1b
+ shufps m1, m1, 0x1b
+ xorps m0, m2
+%endif
+ mova [r0 + r3], m1
+ mova [r1 + r2], m0
+ sub r3, mmsize
+ add r2, mmsize
+ jl .loop
+%if cpuflag(3dnow)
+ femms
+ RET
+%else
+ REP_RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+IMDCT_CALC_FUNC
+INIT_MMX 3dnowext
+IMDCT_CALC_FUNC
+%endif
+
+INIT_XMM sse
+IMDCT_CALC_FUNC
+
%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
@@ -689,7 +756,7 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%endif
%assign n 1<<%1
-%rep 17-%1
+%rep 18-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
@@ -714,9 +781,11 @@ align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
+%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
@@ -729,70 +798,6 @@ DECL_FFT 4
DECL_FFT 4, _interleave
%endif
-%if CONFIG_MDCT
-
-%macro IMDCT_CALC_FUNC 0
-cglobal imdct_calc, 3,5,3
- mov r3d, [r0 + FFTContext.mdctsize]
- mov r4, [r0 + FFTContext.imdcthalf]
- add r1, r3
- PUSH r3
- PUSH r1
-%if ARCH_X86_32
- push r2
- push r1
- push r0
-%else
- sub rsp, 8+32*WIN64 ; allocate win64 shadow space
-%endif
- call r4
-%if ARCH_X86_32
- add esp, 12
-%else
- add rsp, 8+32*WIN64
-%endif
- POP r1
- POP r3
- lea r0, [r1 + 2*r3]
- mov r2, r3
- sub r3, mmsize
- neg r2
- mova m2, [ps_m1m1m1m1]
-.loop:
-%if mmsize == 8
- PSWAPD m0, [r1 + r3]
- PSWAPD m1, [r0 + r2]
- pxor m0, m2
-%else
- mova m0, [r1 + r3]
- mova m1, [r0 + r2]
- shufps m0, m0, 0x1b
- shufps m1, m1, 0x1b
- xorps m0, m2
-%endif
- mova [r0 + r3], m1
- mova [r1 + r2], m0
- sub r3, mmsize
- add r2, mmsize
- jl .loop
-%if cpuflag(3dnow)
- femms
- RET
-%else
- REP_RET
-%endif
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-IMDCT_CALC_FUNC
-INIT_MMX 3dnowext
-IMDCT_CALC_FUNC
-%endif
-
-INIT_XMM sse
-IMDCT_CALC_FUNC
-
INIT_XMM sse
%undef mulps
%undef addps
@@ -994,7 +999,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
sub r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
- movd m7, [ps_m1m1m1m1]
+ movd m7, [ps_neg]
%endif
.pre:
%if ARCH_X86_64 == 0
@@ -1082,6 +1087,7 @@ DECL_IMDCT POSROTATESHUF_3DNOW
%endif
INIT_YMM avx
-DECL_IMDCT POSROTATESHUF_AVX
-%endif ; CONFIG_MDCT
+%if HAVE_AVX_EXTERNAL
+DECL_IMDCT POSROTATESHUF_AVX
+%endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 94405d0cb4..398091eb1f 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,4 +27,12 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
+void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
+
#endif /* AVCODEC_X86_FFT_H */
diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c
index ed1290997b..928f1dcda7 100644
--- a/libavcodec/x86/fft_init.c
+++ b/libavcodec/x86/fft_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -28,23 +28,33 @@ av_cold void ff_fft_init_x86(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
+ if (s->nbits > 16)
+ return;
+
#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
+ s->imdct_calc = ff_imdct_calc_3dnow;
+ s->imdct_half = ff_imdct_half_3dnow;
s->fft_calc = ff_fft_calc_3dnow;
}
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
+ s->imdct_calc = ff_imdct_calc_3dnowext;
+ s->imdct_half = ff_imdct_half_3dnowext;
s->fft_calc = ff_fft_calc_3dnowext;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE(cpu_flags)) {
+ s->imdct_calc = ff_imdct_calc_sse;
+ s->imdct_half = ff_imdct_half_sse;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_sse;
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
}
if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
+ s->imdct_half = ff_imdct_half_avx;
s->fft_calc = ff_fft_calc_avx;
s->fft_permutation = FF_FFT_PERM_AVX;
}
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
new file mode 100644
index 0000000000..e285158185
--- /dev/null
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -0,0 +1,101 @@
+;******************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+ cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
+ DECLARE_REG_TMP 5, 6
+ %define length r2d
+
+ movsxd orderq, orderd
+%else
+ cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
+ DECLARE_REG_TMP 2, 5
+ %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32. This means that we only
+; need to copy a maximum of 32 samples. Therefore we let the preprocessor
+; unroll this loop and copy all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+ movu m0, [smpq+iter]
+ movu [resq+iter], m0
+ %assign iter iter+mmsize
+%endrep
+
+lea resq, [resq+orderq*4]
+lea smpq, [smpq+orderq*4]
+lea coefsq, [coefsq+orderq*4]
+sub length, orderd
+movd m3, r5m
+neg orderq
+
+%define posj t0q
+%define negj t1q
+
+.looplen:
+ pxor m0, m0
+ pxor m4, m4
+ pxor m6, m6
+ mov posj, orderq
+ xor negj, negj
+
+ .looporder:
+ movd m2, [coefsq+posj*4] ; c = coefs[j]
+ SPLATD m2
+ movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+ movu m5, [smpq+negj*4-4+mmsize]
+ movu m7, [smpq+negj*4-4+mmsize*2]
+ pmulld m1, m2
+ pmulld m5, m2
+ pmulld m7, m2
+ paddd m0, m1 ; p += c * s
+ paddd m4, m5
+ paddd m6, m7
+
+ dec negj
+ inc posj
+ jnz .looporder
+
+ psrad m0, m3 ; p >>= shift
+ psrad m4, m3
+ psrad m6, m3
+ movu m1, [smpq]
+ movu m5, [smpq+mmsize]
+ movu m7, [smpq+mmsize*2]
+ psubd m1, m0 ; smp[i] - p
+ psubd m5, m4
+ psubd m7, m6
+ movu [resq], m1 ; res[i] = smp[i] - (p >> shift)
+ movu [resq+mmsize], m5
+ movu [resq+mmsize*2], m7
+
+ add resq, 3*mmsize
+ add smpq, 3*mmsize
+ sub length, (3*mmsize)/4
+jg .looplen
+RET
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
new file mode 100644
index 0000000000..7138611526
--- /dev/null
+++ b/libavcodec/x86/flacdsp.asm
@@ -0,0 +1,313 @@
+;******************************************************************************
+;* FLAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 Loren Merritt
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
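+; Multiply-accumulate of packed signed dwords into qwords: %1 += %2 * %3
+; (low dword of each qword lane). Uses XOP's pmacsdql when available; the
+; SSE4 fallback clobbers %2.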
+%macro PMACSDQL 5
+%if cpuflag(xop)
+ pmacsdql %1, %2, %3, %1
+%else
+ pmuldq %2, %3
+ paddq %1, %2
+%endif
+%endmacro
+
+%macro LPC_32 1
+INIT_XMM %1
+cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+ sub lend, pred_orderd
+ jle .ret
+ lea decodedq, [decodedq+pred_orderq*4-8]
+ lea coeffsq, [coeffsq+pred_orderq*4]
+ neg pred_orderq
+ movd m4, qlevelm
+ALIGN 16
+.loop_sample:
+ movd m0, [decodedq+pred_orderq*4+8]
+ add decodedq, 8
+ movd m1, [coeffsq+pred_orderq*4]
+ pxor m2, m2
+ pxor m3, m3
+ lea jq, [pred_orderq+1]
+ test jq, jq
+ jz .end_order
+.loop_order:
+ PMACSDQL m2, m0, m1, m2, m0
+ movd m0, [decodedq+jq*4]
+ PMACSDQL m3, m1, m0, m3, m1
+ movd m1, [coeffsq+jq*4]
+ inc jq
+ jl .loop_order
+.end_order:
+ PMACSDQL m2, m0, m1, m2, m0
+ psrlq m2, m4
+ movd m0, [decodedq]
+ paddd m0, m2
+ movd [decodedq], m0
+ sub lend, 2
+ jl .ret
+ PMACSDQL m3, m1, m0, m3, m1
+ psrlq m3, m4
+ movd m1, [decodedq+4]
+ paddd m1, m3
+ movd [decodedq+4], m1
+ jg .loop_sample
+.ret:
+ REP_RET
+%endmacro
+
+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
+; int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_16 3-4
+cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+ mov lend, lenm
+%endif
+ movd m3, r4m
+ shl lend, 2
+ mov in1q, [in0q + gprsize]
+ mov in0q, [in0q]
+ mov outq, [outq]
+ add in1q, lenq
+ add in0q, lenq
+ add outq, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [in0q + lenq]
+ mova m1, [in1q + lenq]
+%ifidn %1, ms
+ psrad m2, m1, 1
+ psubd m0, m2
+%endif
+%ifnidn %1, indep2
+ p%4d m2, m0, m1
+%endif
+ packssdw m%2, m%2
+ packssdw m%3, m%3
+ punpcklwd m%2, m%3
+ psllw m%2, m3
+ mova [outq + lenq], m%2
+ add lenq, 16
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 ls, 0, 2, sub
+FLAC_DECORRELATE_16 rs, 2, 1, add
+FLAC_DECORRELATE_16 ms, 2, 0, add
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
+; int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_32 5
+cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+ mov lend, lenm
+%endif
+ movd m3, r4m
+ mov in1q, [in0q + gprsize]
+ mov in0q, [in0q]
+ mov outq, [outq]
+ sub in1q, in0q
+
+align 16
+.loop:
+ mova m0, [in0q]
+ mova m1, [in0q + in1q]
+%ifidn %1, ms
+ psrad m2, m1, 1
+ psubd m0, m2
+%endif
+ p%5d m2, m0, m1
+ pslld m%2, m3
+ pslld m%3, m3
+
+ SBUTTERFLY dq, %2, %3, %4
+
+ mova [outq ], m%2
+ mova [outq + mmsize], m%3
+
+ add in0q, mmsize
+ add outq, mmsize*2
+ sub lend, mmsize/4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
+FLAC_DECORRELATE_32 rs, 2, 1, 0, add
+FLAC_DECORRELATE_32 ms, 2, 0, 1, add
+
+;-----------------------------------------------------------------------------------------
+;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
+; int len, int shift);
+;-----------------------------------------------------------------------------------------
+;%1 = bps
+;%2 = channels
+;%3 = last xmm reg used
+;%4 = word/dword (shift instruction)
+%macro FLAC_DECORRELATE_INDEP 4
+%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
+cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
+%if ARCH_X86_32
+%if %2 == 6
+ DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
+ %define lend dword r3m
+%else
+ mov lend, lenm
+%endif
+%endif
+ movd m%3, r4m
+
+%assign %%i 1
+%rep %2-1
+ mov in %+ %%i %+ q, [in0q+%%i*gprsize]
+%assign %%i %%i+1
+%endrep
+
+ mov in0q, [in0q]
+ mov outq, [outq]
+
+%assign %%i 1
+%rep %2-1
+ sub in %+ %%i %+ q, in0q
+%assign %%i %%i+1
+%endrep
+
+align 16
+.loop:
+ mova m0, [in0q]
+
+%assign %%i 1
+%rep REPCOUNT-1
+ mova m %+ %%i, [in0q + in %+ %%i %+ q]
+%assign %%i %%i+1
+%endrep
+
+%if %1 == 32
+
+%if %2 == 8
+ TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
+%elif %2 == 6
+ SBUTTERFLY dq, 0, 1, 6
+ SBUTTERFLY dq, 2, 3, 6
+ SBUTTERFLY dq, 4, 5, 6
+
+ punpcklqdq m6, m0, m2
+ punpckhqdq m2, m4
+ shufps m4, m0, 0xe4
+ punpcklqdq m0, m1, m3
+ punpckhqdq m3, m5
+ shufps m5, m1, 0xe4
+ SWAP 0,6,1,4,5,3
+%elif %2 == 4
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+%else ; %2 == 2
+ SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%else ; %1 == 16
+
+%if %2 == 8
+ packssdw m0, [in0q + in4q]
+ packssdw m1, [in0q + in5q]
+ packssdw m2, [in0q + in6q]
+ packssdw m3, [in0q + in7q]
+ TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+%elif %2 == 6
+ packssdw m0, [in0q + in3q]
+ packssdw m1, [in0q + in4q]
+ packssdw m2, [in0q + in5q]
+ pshufd m3, m0, q1032
+ punpcklwd m0, m1
+ punpckhwd m1, m2
+ punpcklwd m2, m3
+
+ shufps m3, m0, m2, q2020
+ shufps m0, m1, q2031
+ shufps m2, m1, q3131
+ shufps m1, m2, m3, q3120
+ shufps m3, m0, q0220
+ shufps m0, m2, q3113
+ SWAP 2, 0, 3
+%else ; %2 == 4
+ packssdw m0, [in0q + in2q]
+ packssdw m1, [in0q + in3q]
+ SBUTTERFLY wd, 0, 1, 2
+ SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%endif
+
+%assign %%i 0
+%rep REPCOUNT
+ psll%4 m %+ %%i, m%3
+%assign %%i %%i+1
+%endrep
+
+%assign %%i 0
+%rep REPCOUNT
+ mova [outq + %%i*mmsize], m %+ %%i
+%assign %%i %%i+1
+%endrep
+
+ add in0q, mmsize
+ add outq, mmsize*REPCOUNT
+ sub lend, mmsize/4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
+FLAC_DECORRELATE_INDEP 32, 2, 3, d
+FLAC_DECORRELATE_INDEP 16, 4, 3, w
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 16, 6, 4, w
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
+
+INIT_XMM avx
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
new file mode 100644
index 0000000000..e28c5c9322
--- /dev/null
+++ b/libavcodec/x86/flacdsp_init.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
+ int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+ int qlevel, int len);
+
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
+
+#define DECORRELATE_FUNCS(fmt, opt) \
+void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift)
+
+DECORRELATE_FUNCS(16, sse2);
+DECORRELATE_FUNCS(16, avx);
+DECORRELATE_FUNCS(32, sse2);
+DECORRELATE_FUNCS(32, avx);
+
+av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
+ int bps)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+#if CONFIG_FLAC_DECODER
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ if (fmt == AV_SAMPLE_FMT_S16) {
+ if (channels == 2)
+ c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
+ else if (channels == 4)
+ c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
+ else if (channels == 6)
+ c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
+ else if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
+ c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
+ c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
+ c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
+ } else if (fmt == AV_SAMPLE_FMT_S32) {
+ if (channels == 2)
+ c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
+ else if (channels == 4)
+ c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
+ else if (channels == 6)
+ c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
+ else if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
+ c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
+ c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
+ c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
+ }
+ }
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->lpc32 = ff_flac_lpc_32_sse4;
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ if (fmt == AV_SAMPLE_FMT_S16) {
+ if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
+ } else if (fmt == AV_SAMPLE_FMT_S32) {
+ if (channels == 4)
+ c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
+ else if (channels == 6)
+ c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
+ else if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
+ }
+ }
+ if (EXTERNAL_XOP(cpu_flags)) {
+ c->lpc32 = ff_flac_lpc_32_xop;
+ }
+#endif
+
+#if CONFIG_FLAC_ENCODER
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ if (CONFIG_GPL)
+ c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
+ }
+#endif
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 2a3e4a5f74..8f62a0a093 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -2,20 +2,20 @@
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -75,3 +75,50 @@ INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3
+
+;------------------------------------------------------------------------------
+; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
+; const float *mul, int len);
+;------------------------------------------------------------------------------
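+; Converts int32 samples to float and scales each block of 8 samples by mul[i].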
+%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
+cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
+ shl lend, 2
+ add srcq, lenq
+ add dstq, lenq
+ neg lenq
+.loop:
+ movss m0, [mulq]
+ SPLATD m0
+%if cpuflag(sse2)
+ cvtdq2ps m1, [srcq+lenq ]
+ cvtdq2ps m2, [srcq+lenq+16]
+%else
+ cvtpi2ps m1, [srcq+lenq ]
+ cvtpi2ps m3, [srcq+lenq+ 8]
+ cvtpi2ps m2, [srcq+lenq+16]
+ cvtpi2ps m4, [srcq+lenq+24]
+ movlhps m1, m3
+ movlhps m2, m4
+%endif
+ mulps m1, m0
+ mulps m2, m0
+ mova [dstq+lenq ], m1
+ mova [dstq+lenq+16], m2
+ add mulq, 4
+ add lenq, 32
+ jl .loop
+%if notcpuflag(sse2)
+ ;; cvtpi2ps supposedly switches to MMX even if the source is a memory
+ ;; location, but every tested CPU disagrees, so this is possibly an error
+ ;; in the documentation. Use emms anyway, since the vast majority of
+ ;; machines will use the SSE2 variant.
+ emms
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_ARRAY8
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_ARRAY8
+
diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c
index 2f10db4fc1..6d35377a77 100644
--- a/libavcodec/x86/fmtconvert_init.c
+++ b/libavcodec/x86/fmtconvert_init.c
@@ -5,20 +5,20 @@
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -31,6 +31,10 @@
void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
+void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
+ const float *mul, int len);
+void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
+ const float *mul, int len);
#endif /* HAVE_YASM */
@@ -41,9 +45,11 @@ av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx
if (EXTERNAL_SSE(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+ c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+ c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
}
#endif /* HAVE_YASM */
}
diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index b1be289a92..961a1587a7 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -25,84 +25,82 @@
SECTION .text
-INIT_MMX mmxext
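+; pavgb emulation for plain MMX: (a | b) - (((a ^ b) & 0xfe..fe) >> 1),
+; i.e. the rounded average (a + b + 1) >> 1.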
+%macro PAVGB_MMX 4
+ LOAD %3, %1
+ por %3, %2
+ pxor %2, %1
+ pand %2, %4
+ psrlq %2, 1
+ psubb %3, %2
+ SWAP %2, %3
+%endmacro
+
; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
; ptrdiff_t line_size, int h)
-%macro PIXELS48 2
-%if %2 == 4
-%define OP movh
+%macro OP_PIXELS 2
+%if %2 == mmsize/2
+%define LOAD movh
+%define SAVE movh
+%define LEN mmsize
%else
-%define OP mova
+%define LOAD movu
+%define SAVE mova
+%define LEN %2
%endif
-cglobal %1_pixels%2, 4,5
+cglobal %1_pixels%2, 4,5,4
lea r4, [r2*3]
+%ifidn %1, avg
+%if notcpuflag(mmxext)
+ pcmpeqd m6, m6
+ paddb m6, m6
+%endif
+%endif
.loop:
- OP m0, [r1]
- OP m1, [r1+r2]
- OP m2, [r1+r2*2]
- OP m3, [r1+r4]
- lea r1, [r1+r2*4]
+%assign %%i 0
+%rep LEN/mmsize
+ LOAD m0, [r1 + %%i]
+ LOAD m1, [r1+r2 + %%i]
+ LOAD m2, [r1+r2*2 + %%i]
+ LOAD m3, [r1+r4 + %%i]
%ifidn %1, avg
- pavgb m0, [r0]
- pavgb m1, [r0+r2]
- pavgb m2, [r0+r2*2]
- pavgb m3, [r0+r4]
+%if notcpuflag(mmxext)
+ PAVGB_MMX [r0 + %%i], m0, m4, m6
+ PAVGB_MMX [r0+r2 + %%i], m1, m5, m6
+ PAVGB_MMX [r0+r2*2 + %%i], m2, m4, m6
+ PAVGB_MMX [r0+r4 + %%i], m3, m5, m6
+%else
+ pavgb m0, [r0 + %%i]
+ pavgb m1, [r0+r2 + %%i]
+ pavgb m2, [r0+r2*2 + %%i]
+ pavgb m3, [r0+r4 + %%i]
+%endif
%endif
- OP [r0], m0
- OP [r0+r2], m1
- OP [r0+r2*2], m2
- OP [r0+r4], m3
+ SAVE [r0 + %%i], m0
+ SAVE [r0+r2 + %%i], m1
+ SAVE [r0+r2*2 + %%i], m2
+ SAVE [r0+r4 + %%i], m3
+%assign %%i %%i+mmsize
+%endrep
sub r3d, 4
+ lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
jne .loop
RET
%endmacro
-PIXELS48 put, 4
-PIXELS48 avg, 4
-PIXELS48 put, 8
-PIXELS48 avg, 8
+INIT_MMX mmx
+OP_PIXELS put, 4
+OP_PIXELS avg, 4
+OP_PIXELS put, 8
+OP_PIXELS avg, 8
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
+INIT_MMX mmxext
+OP_PIXELS avg, 4
+OP_PIXELS avg, 8
+OP_PIXELS avg, 16
INIT_XMM sse2
-; void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-; ptrdiff_t line_size, int h)
-cglobal put_pixels16, 4,5,4
- lea r4, [r2*3]
-.loop:
- movu m0, [r1]
- movu m1, [r1+r2]
- movu m2, [r1+r2*2]
- movu m3, [r1+r4]
- lea r1, [r1+r2*4]
- mova [r0], m0
- mova [r0+r2], m1
- mova [r0+r2*2], m2
- mova [r0+r4], m3
- sub r3d, 4
- lea r0, [r0+r2*4]
- jnz .loop
- REP_RET
-
-; void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-; ptrdiff_t line_size, int h)
-cglobal avg_pixels16, 4,5,4
- lea r4, [r2*3]
-.loop:
- movu m0, [r1]
- movu m1, [r1+r2]
- movu m2, [r1+r2*2]
- movu m3, [r1+r4]
- lea r1, [r1+r2*4]
- pavgb m0, [r0]
- pavgb m1, [r0+r2]
- pavgb m2, [r0+r2*2]
- pavgb m3, [r0+r4]
- mova [r0], m0
- mova [r0+r2], m1
- mova [r0+r2*2], m2
- mova [r0+r4], m3
- sub r3d, 4
- lea r0, [r0+r2*4]
- jnz .loop
- REP_RET
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
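
A note on the new PAVGB_MMX helper above: plain MMX has no pavgb, so the macro
builds the rounded byte average from bitwise operations. The 0xFE mask (set up
with pcmpeqd/paddb into m6) clears each byte's low bit so the 64-bit psrlq
cannot leak bits across byte lanes. A scalar sketch of the identity it relies on:

    #include <stdint.h>

    /* pavgb computes (a + b + 1) >> 1 per byte; PAVGB_MMX reaches the same
     * result as (a | b) - (((a ^ b) & 0xFE) >> 1) via por/pxor/pand/psrlq/psubb. */
    static inline uint8_t avg_round_up(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a | b) - (((a ^ b) & 0xFE) >> 1));
    }
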
diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h
index 88d1415ade..4e83cf71c3 100644
--- a/libavcodec/x86/fpel.h
+++ b/libavcodec/x86/fpel.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -22,18 +22,24 @@
#include <stddef.h>
#include <stdint.h>
+void ff_avg_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c
deleted file mode 100644
index 813bcc2b37..0000000000
--- a/libavcodec/x86/fpel_mmx.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "fpel.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-// in case more speed is needed - unrolling would certainly help
-void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, %0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- }
- while (--h);
-}
-
-void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, %0 \n\t"
- "movq 8%0, %%mm0 \n\t"
- "movq 8%1, %%mm1 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, 8%0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- }
- while (--h);
-}
-
-void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- __asm__ volatile (
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r"(pixels), "+r"(block)
- : "r"((x86_reg)line_size)
- : "%"FF_REG_a, "memory"
- );
-}
-
-void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- __asm__ volatile (
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq 8(%1 ), %%mm4 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm5 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq 8(%1 ), %%mm4 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm5 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r"(pixels), "+r"(block)
- : "r"((x86_reg)line_size)
- : "%"FF_REG_a, "memory"
- );
-}
-
-#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/g722dsp.asm b/libavcodec/x86/g722dsp.asm
new file mode 100644
index 0000000000..a529422262
--- /dev/null
+++ b/libavcodec/x86/g722dsp.asm
@@ -0,0 +1,54 @@
+;******************************************************************************
+;* SIMD optimized DSP functions for G722 coding
+;*
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_qmf_coeffs: dw 3, -210, -11, -805, -11, 951, 53, 3876
+pw_qmf_coeffs2: dw 12, 3876, -156, 951, 32, -805, 362, -210
+pw_qmf_coeffs3: dw 362, 0 , 32, 0, -156, 0, 12, 0
+pw_qmf_coeffs4: dw 53, 0, -11, 0, -11, 0, 3, 0
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal g722_apply_qmf, 2, 2, 5, prev, out
+ movu m0, [prevq+mmsize*0]
+ movu m1, [prevq+mmsize*1]
+ movu m2, [prevq+mmsize*2]
+ punpcklwd m3, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m4, m2, m2
+ punpckhwd m2, m2
+ pmaddwd m3, [pw_qmf_coeffs ]
+ pmaddwd m0, [pw_qmf_coeffs2]
+ pmaddwd m4, [pw_qmf_coeffs3]
+ pmaddwd m2, [pw_qmf_coeffs4]
+ paddd m0, m3
+ paddd m2, m4
+ paddd m0, m2
+ pshufd m2, m0, q0032
+ paddd m0, m2
+ pshufd m0, m0, q0001
+ movq [outq], m0
+ RET
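
The SSE2 kernel above evaluates both 12-tap QMF dot products in a single pass
over the 24-sample history. A rough scalar model (the flat qmf_coeffs[12]
parameter is an assumption for illustration; the asm keeps the taps
pre-interleaved in pw_qmf_coeffs..pw_qmf_coeffs4 so they line up with pmaddwd):

    static void g722_apply_qmf_c(const int16_t *prev_samples, int xout[2],
                                 const int16_t qmf_coeffs[12])
    {
        int i;
        xout[0] = xout[1] = 0;
        for (i = 0; i < 12; i++) {
            /* even history samples feed one sub-band, odd samples the other */
            xout[1] += prev_samples[2 * i]     * qmf_coeffs[i];
            xout[0] += prev_samples[2 * i + 1] * qmf_coeffs[11 - i];
        }
    }
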
diff --git a/libavcodec/x86/g722dsp_init.c b/libavcodec/x86/g722dsp_init.c
new file mode 100644
index 0000000000..614695193b
--- /dev/null
+++ b/libavcodec/x86/g722dsp_init.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/g722dsp.h"
+
+void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);
+
+av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags))
+ dsp->apply_qmf = ff_g722_apply_qmf_sse2;
+}
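
Typical wiring on the generic side (a sketch only; g722dsp.c is not part of this
diff and the C fallback name is assumed):

    av_cold void ff_g722dsp_init(G722DSPContext *c)
    {
        c->apply_qmf = g722_apply_qmf;  /* C fallback, name assumed */

        if (ARCH_X86)
            ff_g722dsp_init_x86(c);     /* overrides with SSE2 when available */
    }
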
diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm
index cd726ba86d..77c8cf154d 100644
--- a/libavcodec/x86/h263_loopfilter.asm
+++ b/libavcodec/x86/h263_loopfilter.asm
@@ -1,20 +1,22 @@
;******************************************************************************
;* MMX-optimized H.263 loop filter
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c
index d4fab981bf..ab81063233 100644
--- a/libavcodec/x86/h263dsp_init.c
+++ b/libavcodec/x86/h263dsp_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index a9cac59f0f..b5a78b537d 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;* 2005-2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index ff53b91c42..34bc41969b 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -249,8 +249,10 @@ cglobal %1_h264_chroma_mc2_10, 6,7
%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
+%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put
@@ -258,8 +260,10 @@ CHROMA_MC2 put
%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
+%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 33fd5a9dd7..6702ae98d4 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,20 +7,20 @@
;* Fiona Glaser <fiona@x264.com>
;* Oskar Arvidsson <oskar@irock.se>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -37,11 +37,6 @@ cextern pb_0
cextern pb_1
cextern pb_3
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
- [base], [base+stride], [base+stride*2], [base3], \
- [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
%define PASS8ROWS(base, base3, stride, stride3, offset) \
PASS8ROWS(base+offset, base3+offset, stride, stride3)
@@ -287,18 +282,18 @@ cextern pb_3
; int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
-cglobal deblock_v_luma_8, 5,5,10
+cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
movd m8, [r4] ; tc0
- lea r4, [r1*3]
- dec r2d ; alpha-1
+ lea r4, [stride_q*3]
+ dec alpha_d ; alpha-1
neg r4
- dec r3d ; beta-1
- add r4, r0 ; pix-3*stride
+ dec beta_d ; beta-1
+ add base3_q, pix_q ; pix-3*stride
- mova m0, [r4+r1] ; p1
- mova m1, [r4+2*r1] ; p0
- mova m2, [r0] ; q0
- mova m3, [r0+r1] ; q1
+ mova m0, [base3_q + stride_q] ; p1
+ mova m1, [base3_q + 2*stride_q] ; p0
+ mova m2, [pix_q] ; q0
+ mova m3, [pix_q + stride_q] ; q1
LOAD_MASK r2d, r3d
punpcklbw m8, m8
@@ -308,24 +303,24 @@ cglobal deblock_v_luma_8, 5,5,10
pandn m9, m7
pand m8, m9
- movdqa m3, [r4] ; p2
+ movdqa m3, [base3_q] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m9
psubb m7, m8, m6
pand m6, m8
- LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+ LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4
- movdqa m4, [r0+2*r1] ; q2
+ movdqa m4, [pix_q + 2*stride_q] ; q2
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
pand m6, m9
pand m8, m6
psubb m7, m6
- mova m3, [r0+r1]
- LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
+ mova m3, [pix_q + stride_q]
+ LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6
DEBLOCK_P0_Q0
- mova [r4+2*r1], m1
- mova [r0], m2
+ mova [base3_q + 2*stride_q], m1
+ mova [pix_q], m2
RET
;-----------------------------------------------------------------------------
@@ -382,10 +377,101 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
RET
%endmacro
+%macro DEBLOCK_H_LUMA_MBAFF 0
+
+cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
+ movsxd stride_q, stride_d
+ dec alpha_d
+ dec beta_d
+ mov base3_q, pix_q
+ lea stride3_q, [3*stride_q]
+ add base3_q, stride3_q
+
+ movq m0, [pix_q - 4]
+ movq m1, [pix_q + stride_q - 4]
+ movq m2, [pix_q + 2*stride_q - 4]
+ movq m3, [base3_q - 4]
+ movq m4, [base3_q + stride_q - 4]
+ movq m5, [base3_q + 2*stride_q - 4]
+ movq m6, [base3_q + stride3_q - 4]
+ movq m7, [base3_q + 4*stride_q - 4]
+
+ TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
+
+ %assign i 0
+ %rep 8
+ movq [rsp + 16*i], m %+ i
+ %assign i i+1
+ %endrep
+
+ ; p2 = m1 [rsp + 16]
+ ; p1 = m2 [rsp + 32]
+ ; p0 = m3 [rsp + 48]
+ ; q0 = m4 [rsp + 64]
+ ; q1 = m5 [rsp + 80]
+ ; q2 = m6 [rsp + 96]
+
+ SWAP 0, 2
+ SWAP 1, 3
+ SWAP 2, 4
+ SWAP 3, 5
+
+ LOAD_MASK alpha_d, beta_d
+ movd m8, [tc0_q]
+ punpcklbw m8, m8
+ pcmpeqb m9, m9
+ pcmpeqb m9, m8
+ pandn m9, m7
+ pand m8, m9
+
+ movdqa m3, [rsp + 16] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m9
+ psubb m7, m8, m6
+ pand m6, m8
+ LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4
+
+ movdqa m4, [rsp + 96] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ pand m6, m9
+ pand m8, m6
+ psubb m7, m6
+ mova m3, [rsp + 80]
+ LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6
+
+ DEBLOCK_P0_Q0
+ SWAP 1, 3
+ SWAP 2, 4
+ movq m0, [rsp]
+ movq m1, [rsp + 16]
+ movq m2, [rsp + 32]
+ movq m5, [rsp + 80]
+ movq m6, [rsp + 96]
+ movq m7, [rsp + 112]
+
+ TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
+ movq [pix_q - 4], m0
+ movq [pix_q + stride_q - 4], m1
+ movq [pix_q + 2*stride_q - 4], m2
+ movq [base3_q - 4], m3
+ movq [base3_q + stride_q - 4], m4
+ movq [base3_q + 2*stride_q - 4], m5
+ movq [base3_q + stride3_q - 4], m6
+ movq [base3_q + 4*stride_q - 4], m7
+
+RET
+
+%endmacro
+
INIT_XMM sse2
+DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
+
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
+DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
+%endif
%else
@@ -499,8 +585,10 @@ INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
+%endif
%endif ; ARCH
@@ -772,8 +860,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80
INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
+%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
@@ -836,7 +926,11 @@ cglobal deblock_h_chroma_8, 5,7
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
movq buf1, m3
- call ff_chroma_inter_body_mmxext
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
@@ -854,7 +948,52 @@ ff_chroma_inter_body_mmxext:
DEBLOCK_P0_Q0
ret
+%define t5 r4
+%define t6 r5
+
+cglobal deblock_h_chroma422_8, 5, 6
+ SUB rsp, (1+ARCH_X86_64*2)*mmsize
+ %if ARCH_X86_64
+ %define buf0 [rsp+16]
+ %define buf1 [rsp+8]
+ %else
+ %define buf0 r0m
+ %define buf1 r2m
+ %endif
+
+ movd m6, [r4]
+ punpcklbw m6, m6
+ movq [rsp], m6
+ CHROMA_H_START
+ TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+ movq buf0, m0
+ movq buf1, m3
+ LOAD_MASK r2d, r3d
+ movd m6, [rsp]
+ punpcklwd m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ movq m0, buf0
+ movq m3, buf1
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+ lea r0, [r0+r1*8]
+ lea t5, [t5+r1*8]
+
+ TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+ movq buf0, m0
+ movq buf1, m3
+ LOAD_MASK r2d, r3d
+ movd m6, [rsp+4]
+ punpcklwd m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ movq m0, buf0
+ movq m3, buf1
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+ ADD rsp, (1+ARCH_X86_64*2)*mmsize
+RET
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
@@ -867,9 +1006,6 @@ ff_chroma_inter_body_mmxext:
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
-%define t5 r4
-%define t6 r5
-
;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
@@ -894,6 +1030,20 @@ cglobal deblock_h_chroma_intra_8, 4,6
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
RET
+cglobal deblock_h_chroma422_intra_8, 4, 6
+ CHROMA_H_START
+ TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+ call ff_chroma_intra_body_mmxext
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+ lea r0, [r0+r1*8]
+ lea t5, [t5+r1*8]
+
+ TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+ call ff_chroma_intra_body_mmxext
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+RET
+
ALIGN 16
ff_chroma_intra_body_mmxext:
LOAD_MASK r2d, r3d
@@ -909,6 +1059,202 @@ ff_chroma_intra_body_mmxext:
paddb m2, m6
ret
+%macro LOAD_8_ROWS 8
+ movd m0, %1
+ movd m1, %2
+ movd m2, %3
+ movd m3, %4
+ movd m4, %5
+ movd m5, %6
+ movd m6, %7
+ movd m7, %8
+%endmacro
+
+%macro STORE_8_ROWS 8
+ movd %1, m0
+ movd %2, m1
+ movd %3, m2
+ movd %4, m3
+ movd %5, m4
+ movd %6, m5
+ movd %7, m6
+ movd %8, m7
+%endmacro
+
+%macro TRANSPOSE_8x4B_XMM 0
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+ punpcklwd m0, m2
+ punpcklwd m4, m6
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ MOVHL m1, m0
+ MOVHL m3, m2
+%endmacro
+
+%macro TRANSPOSE_4x8B_XMM 0
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpckhwd m4, m0, m2
+ punpcklwd m0, m2
+ MOVHL m6, m4
+ MOVHL m2, m0
+ pshufd m1, m0, 1
+ pshufd m3, m2, 1
+ pshufd m5, m4, 1
+ pshufd m7, m6, 1
+%endmacro
+
+%macro CHROMA_INTER_BODY_XMM 1
+ LOAD_MASK alpha_d, beta_d
+ movd m6, [tc0_q]
+ %rep %1
+ punpcklbw m6, m6
+ %endrep
+ pand m7, m6
+ DEBLOCK_P0_Q0
+%endmacro
+
+%macro CHROMA_INTRA_BODY_XMM 0
+ LOAD_MASK alpha_d, beta_d
+ mova m5, m1
+ mova m6, m2
+ pxor m4, m1, m3
+ pand m4, [pb_1]
+ pavgb m1, m3
+ psubusb m1, m4
+ pavgb m1, m0
+ pxor m4, m2, m0
+ pand m4, [pb_1]
+ pavgb m2, m0
+ psubusb m2, m4
+ pavgb m2, m3
+ psubb m1, m5
+ psubb m2, m6
+ pand m1, m7
+ pand m2, m7
+ paddb m1, m5
+ paddb m2, m6
+%endmacro
+
+%macro CHROMA_V_START_XMM 1
+ movsxdifnidn stride_q, stride_d
+ dec alpha_d
+ dec beta_d
+ mov %1, pix_q
+ sub %1, stride_q
+ sub %1, stride_q
+%endmacro
+
+%macro CHROMA_H_START_XMM 2
+ movsxdifnidn stride_q, stride_d
+ dec alpha_d
+ dec beta_d
+ lea %2, [3*stride_q]
+ mov %1, pix_q
+ add %1, %2
+%endmacro
+
+%macro DEBLOCK_CHROMA_XMM 1
+
+INIT_XMM %1
+
+cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
+ CHROMA_V_START_XMM r5
+ movq m0, [r5]
+ movq m1, [r5 + stride_q]
+ movq m2, [pix_q]
+ movq m3, [pix_q + stride_q]
+ CHROMA_INTER_BODY_XMM 1
+ movq [r5 + stride_q], m1
+ movq [pix_q], m2
+RET
+
+cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
+ CHROMA_H_START_XMM r5, r6
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+ TRANSPOSE_8x4B_XMM
+ movq [rsp], m0
+ movq [rsp + 8], m3
+ CHROMA_INTER_BODY_XMM 1
+ movq m0, [rsp]
+ movq m3, [rsp + 8]
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
+cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_,
+ CHROMA_H_START_XMM r5, r6
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+ TRANSPOSE_8x4B_XMM
+ movq [rsp], m0
+ movq [rsp + 8], m3
+ CHROMA_INTER_BODY_XMM 2
+ movq m0, [rsp]
+ movq m3, [rsp + 8]
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+
+ lea pix_q, [pix_q + 8*stride_q]
+ lea r5, [r5 + 8*stride_q]
+ add tc0_q, 2
+
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+ TRANSPOSE_8x4B_XMM
+ movq [rsp], m0
+ movq [rsp + 8], m3
+ CHROMA_INTER_BODY_XMM 2
+ movq m0, [rsp]
+ movq m3, [rsp + 8]
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
+cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
+ CHROMA_V_START_XMM r4
+ movq m0, [r4]
+ movq m1, [r4 + stride_q]
+ movq m2, [pix_q]
+ movq m3, [pix_q + stride_q]
+ CHROMA_INTRA_BODY_XMM
+ movq [r4 + stride_q], m1
+ movq [pix_q], m2
+RET
+
+cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
+ CHROMA_H_START_XMM r4, r5
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+ TRANSPOSE_8x4B_XMM
+ CHROMA_INTRA_BODY_XMM
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+RET
+
+cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
+ CHROMA_H_START_XMM r4, r5
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+ TRANSPOSE_8x4B_XMM
+ CHROMA_INTRA_BODY_XMM
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+
+ lea pix_q, [pix_q + 8*stride_q]
+ lea r4, [r4 + 8*stride_q]
+
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+ TRANSPOSE_8x4B_XMM
+ CHROMA_INTRA_BODY_XMM
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+RET
+
+%endmacro ; DEBLOCK_CHROMA_XMM
+
+DEBLOCK_CHROMA_XMM sse2
+DEBLOCK_CHROMA_XMM avx
+
;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
; int8_t ref[2][40], int16_t mv[2][40][2],
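
For orientation on the chroma paths added above: CHROMA_INTER_BODY_XMM and the
inlined mmxext body both end in DEBLOCK_P0_Q0, whose scalar equivalent is the
standard H.264 normal-filter edge update (a sketch; the SIMD code reaches the
same result with pavgb/pxor arithmetic rather than literal multiplies):

    #include <stdint.h>

    static inline int clip3(int x, int lo, int hi)
    {
        return x < lo ? lo : x > hi ? hi : x;
    }

    /* p1/q1 are the neighbouring pixels, tc the per-edge clipping threshold */
    static void filter_p0_q0(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc)
    {
        int delta = clip3((((*q0 - *p0) * 4) + (p1 - q1) + 4) >> 3, -tc, tc);
        *p0 = (uint8_t)clip3(*p0 + delta, 0, 255);
        *q0 = (uint8_t)clip3(*q0 - delta, 0, 255);
    }
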
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index d049c62bf2..1af3257a67 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -7,34 +7,32 @@
;* Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
SECTION .text
cextern pw_2
cextern pw_3
cextern pw_4
+cextern pw_1023
+%define pw_pixel_max pw_1023
; out: %4 = |%1-%2|-%3
; clobbers: %5
@@ -418,9 +416,11 @@ cglobal deblock_h_luma_10, 5,7,15
INIT_XMM sse2
DEBLOCK_LUMA_64
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
+%endif
%macro SWAPMOVA 2
%ifid %1
@@ -715,8 +715,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16
INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
+%endif
%endif
@@ -802,10 +804,12 @@ DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
+%endif
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
@@ -839,6 +843,83 @@ DEBLOCK_LUMA_INTRA
mova [r0+2*r1], m2
%endmacro
+; in: 8 rows of 4 words in %1..%8
+; out: 4 rows of 8 words in m0..m3
+%macro TRANSPOSE4x8W_LOAD 8
+ movq m0, %1
+ movq m2, %2
+ movq m1, %3
+ movq m3, %4
+
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ punpckhdq m2, m0, m1
+ punpckldq m0, m1
+
+ movq m4, %5
+ movq m6, %6
+ movq m5, %7
+ movq m3, %8
+
+ punpcklwd m4, m6
+ punpcklwd m5, m3
+ punpckhdq m6, m4, m5
+ punpckldq m4, m5
+
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ punpckhqdq m3, m2, m6
+ punpcklqdq m2, m6
+%endmacro
+
+; in: 4 rows of 8 words in m0..m3
+; out: 8 rows of 4 words in %1..%8
+%macro TRANSPOSE8x4W_STORE 8
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ movq %1, m0
+ movhps %2, m0
+ movq %3, m1
+ movhps %4, m1
+ movq %5, m2
+ movhps %6, m2
+ movq %7, m3
+ movhps %8, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to store p1 and q1 values
+%macro CHROMA_H_LOAD 4
+ %if mmsize == 8
+ movq m0, [pix_q - 4]
+ movq m1, [pix_q + stride_q - 4]
+ movq m2, [pix_q + 2*stride_q - 4]
+ movq m3, [%1 - 4]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ %else
+ TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+ %endif
+ mova %3, m0
+ mova %4, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to load p1 and q1 values
+%macro CHROMA_H_STORE 4
+ mova m0, %3
+ mova m3, %4
+ %if mmsize == 8
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ movq [pix_q - 4], m0
+ movq [pix_q + stride_q - 4], m1
+ movq [pix_q + 2*stride_q - 4], m2
+ movq [%1 - 4], m3
+ %else
+ TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+ %endif
+%endmacro
+
%macro CHROMA_V_LOAD_TC 2
movd %1, [%2]
punpcklbw %1, %1
@@ -910,6 +991,81 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
%else
RET
%endif
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
+; int8_t *tc0)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, tc0_
+ shl alpha_d, 2
+ shl beta_d, 2
+ mov r5, pix_q
+ lea r6, [3*stride_q]
+ add r5, r6
+%if mmsize == 8
+ mov r6d, 2
+ .loop:
+%endif
+
+ CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
+ LOAD_AB m4, m5, alpha_d, beta_d
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ pxor m4, m4
+ CHROMA_V_LOAD_TC m6, tc0_q
+ psubw m6, [pw_3]
+ pmaxsw m6, m4
+ pand m7, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]
+
+%if mmsize == 8
+ lea pix_q, [pix_q + 4*stride_q]
+ lea r5, [r5 + 4*stride_q]
+ add tc0_q, 2
+ dec r6d
+ jg .loop
+%endif
+RET
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma422_10(uint16_t *pix, int stride, int alpha, int beta,
+; int8_t *tc0)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma422_10, 5, 7, 8, 0-3*mmsize, pix_, stride_, alpha_, beta_, tc0_
+ shl alpha_d, 2
+ shl beta_d, 2
+
+ movd m0, [tc0_q]
+ punpcklbw m0, m0
+ psraw m0, 6
+ movq [rsp], m0
+
+ mov r5, pix_q
+ lea r6, [3*stride_q]
+ add r5, r6
+
+ mov r4, -8
+ .loop:
+
+ CHROMA_H_LOAD r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]
+ LOAD_AB m4, m5, alpha_d, beta_d
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ pxor m4, m4
+ movd m6, [rsp + r4 + 8]
+ punpcklwd m6, m6
+ punpcklwd m6, m6
+ psubw m6, [pw_3]
+ pmaxsw m6, m4
+ pand m7, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ CHROMA_H_STORE r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]
+
+ lea pix_q, [pix_q + (mmsize/2)*stride_q]
+ lea r5, [r5 + (mmsize/2)*stride_q]
+ add r4, (mmsize/4)
+ jl .loop
+RET
+
%endmacro
%if ARCH_X86_64 == 0
@@ -918,5 +1074,7 @@ DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
+%endif
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index ad57aa91ab..19cd128381 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -2,20 +2,20 @@
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -36,9 +36,15 @@
#if HAVE_INLINE_ASM
+#if ARCH_X86_64
+#define REG64 "r"
+#else
+#define REG64 "m"
+#endif
+
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
//as that would make optimization work hard)
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
#define decode_significance decode_significance_x86
static int decode_significance_x86(CABACContext *c, int max_coeff,
uint8_t *significant_coeff_ctx_base,
@@ -55,6 +61,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
+ : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
@@ -130,6 +137,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
+ : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
@@ -138,7 +146,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
"3: \n\t"
"mov %10, %0 \n\t"
- "movzbl (%0, %6), %k6 \n\t"
+ "movzb (%0, %6), %6 \n\t"
"add %9, %6 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
@@ -149,14 +157,14 @@ static int decode_significance_8x8_x86(CABACContext *c,
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
- "mov %1, %k6 \n\t"
+ "mov %1, %6 \n\t"
"test $1, %4 \n\t"
" jz 4f \n\t"
#ifdef BROKEN_RELOCATIONS
- "movzbl %c14(%15, %q6), %k6\n\t"
+ "movzb %c14(%15, %q6), %6\n\t"
#else
- "movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
+ "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
#endif
"add %11, %6 \n\t"
@@ -169,8 +177,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"%15")
"mov %2, %0 \n\t"
- "mov %1, %k6 \n\t"
- "movl %k6, (%0) \n\t"
+ "mov %1, %6 \n\t"
+ "mov %k6, (%0) \n\t"
"test $1, %4 \n\t"
" jnz 5f \n\t"
@@ -178,19 +186,19 @@ static int decode_significance_8x8_x86(CABACContext *c,
"add"FF_OPSIZE" $4, %2 \n\t"
"4: \n\t"
- "addl $1, %k6 \n\t"
- "mov %k6, %1 \n\t"
- "cmpl $63, %k6 \n\t"
+ "add $1, %6 \n\t"
+ "mov %6, %1 \n\t"
+ "cmp $63, %6 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
- "movl %k6, (%0) \n\t"
+ "mov %k6, (%0) \n\t"
"5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
- : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+ : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
- "m"(sig_off), "m"(last_coeff_ctx_base),
+ REG64(sig_off), REG64(last_coeff_ctx_base),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
@@ -198,7 +206,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
);
return coeff_count;
}
-#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif /* HAVE_7REGS && BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_H264_I386_H */
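
The REG64 change above selects the asm constraint by word size: on x86-64 the
operands may live in registers, while 32-bit builds keep them in memory since
the block already uses up the free GPRs. A tiny standalone illustration of the
string pasting this relies on (using the compiler's __x86_64__ macro here,
rather than the config.h ARCH_X86_64 the real header uses):

    #include <stdio.h>

    #if defined(__x86_64__)
    #define REG64 "r"
    #else
    #define REG64 "m"
    #endif

    int main(void)
    {
        /* adjacent string literals concatenate, so "+"REG64(last) becomes
         * "+r"(last) on x86-64 and "+m"(last) on 32-bit x86 */
        puts("+" REG64);
        return 0;
    }
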
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index eb99476a85..c36fea59b0 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -9,20 +9,20 @@
;* Holger Lubitz <hal@duncan.ol.sub.de>
;* Min Chen <chenm001.163.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
@@ -697,6 +697,38 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
call h264_idct_add8_mmx_plane
RET
+cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+ movsxdifnidn r3, r3d
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+%if ARCH_X86_64
+ mov dst2q, r0
+%endif
+
+ mov r5, 16 ; i
+ add r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t
+
+ call h264_idct_add8_mmx_plane
+ add r5, 4
+ call h264_idct_add8_mmx_plane
+
+%if ARCH_X86_64
+ add dst2q, gprsize ; dest[1]
+%else
+ add r0mp, gprsize
+%endif
+
+ add r5, 4 ; set to 32
+ add r2, 256 ; set to i * 16 * sizeof(dctcoef)
+
+ call h264_idct_add8_mmx_plane
+ add r5, 4
+ call h264_idct_add8_mmx_plane
+
+ RET
+
h264_idct_add8_mmxext_plane:
movsxdifnidn r3, r3d
.nextblock:
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 432d74b1f4..9fd05abb2b 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -5,32 +5,31 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pd_32: times 4 dd 32
-
SECTION .text
+cextern pw_1023
+%define pw_pixel_max pw_1023
+cextern pd_32
+
;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
@@ -84,8 +83,10 @@ cglobal h264_idct_add_10, 3,3
INIT_XMM sse2
IDCT_ADD_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
@@ -118,9 +119,11 @@ add4x4_idct %+ SUFFIX:
INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
+%endif
%macro ADD16_OP 2
cmp byte [r4+%2], 0
@@ -157,8 +160,10 @@ cglobal h264_idct_add16_10, 5,6
INIT_XMM sse2
IDCT_ADD16_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
@@ -224,8 +229,10 @@ cglobal h264_idct8_dc_add_10,3,4,7
INIT_XMM sse2
IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
@@ -298,8 +305,10 @@ cglobal h264_idct_add16intra_10,5,7,8
INIT_XMM sse2
IDCT_ADD16INTRA_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
+%endif
%assign last_block 36
;-----------------------------------------------------------------------------
@@ -336,8 +345,63 @@ cglobal h264_idct_add8_10,5,8,7
INIT_XMM sse2
IDCT_ADD8
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct_add8_422_10(pixel **dst, const int *block_offset,
+; int16_t *block, int stride,
+; const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+%assign last_block 44
+
+%macro IDCT_ADD8_422 0
+
+cglobal h264_idct_add8_422_10, 5, 8, 7
+ movsxdifnidn r3, r3d
+%if ARCH_X86_64
+ mov r7, r0
+%endif
+
+ add r2, 1024
+ mov r0, [r0]
+ ADD16_OP_INTRA 16, 4+ 6*8
+ ADD16_OP_INTRA 18, 4+ 7*8
+ ADD16_OP_INTRA 24, 4+ 8*8 ; i+4
+ ADD16_OP_INTRA 26, 4+ 9*8 ; i+4
+ add r2, 1024-128*4
+
+%if ARCH_X86_64
+ mov r0, [r7+gprsize]
+%else
+ mov r0, r0m
+ mov r0, [r0+gprsize]
+%endif
+
+ ADD16_OP_INTRA 32, 4+11*8
+ ADD16_OP_INTRA 34, 4+12*8
+ ADD16_OP_INTRA 40, 4+13*8 ; i+4
+ ADD16_OP_INTRA 42, 4+14*8 ; i+4
+REP_RET
+ AC 16
+ AC 18
+ AC 24 ; i+4
+ AC 26 ; i+4
+ AC 32
+ AC 34
+ AC 40 ; i+4
+ AC 42 ; i+4
+
+%endmacro
+
+INIT_XMM sse2
+IDCT_ADD8_422
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD8_422
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
@@ -544,8 +608,10 @@ h264_idct8_add1_10 %+ SUFFIX:
INIT_XMM sse2
IDCT8_ADD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
@@ -585,5 +651,7 @@ cglobal h264_idct8_add4_10, 0,7,16
INIT_XMM sse2
IDCT8_ADD4
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
+%endif
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index 1ea97fa1ca..f3aa3172f0 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -5,20 +5,20 @@
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -268,6 +268,43 @@ cglobal pred16x16_tm_vp8_8, 2,6,6
jg .loop
REP_RET
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
+ sub dstq, strideq
+ pmovzxbw m0, [dstq]
+ vpbroadcastb xm1, [r0-1]
+ pmovzxbw m1, xm1
+ psubw m0, m1
+ mov iterationd, 4
+ lea stride3q, [strideq*3]
+.loop:
+ vpbroadcastb xm1, [dstq+strideq*1-1]
+ vpbroadcastb xm2, [dstq+strideq*2-1]
+ vpbroadcastb xm3, [dstq+stride3q-1]
+ vpbroadcastb xm4, [dstq+strideq*4-1]
+ pmovzxbw m1, xm1
+ pmovzxbw m2, xm2
+ pmovzxbw m3, xm3
+ pmovzxbw m4, xm4
+ paddw m1, m0
+ paddw m2, m0
+ paddw m3, m0
+ paddw m4, m0
+ vpackuswb m1, m1, m2
+ vpackuswb m3, m3, m4
+ vpermq m1, m1, q3120
+ vpermq m3, m3, q3120
+ movdqa [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m1, 1
+ movdqa [dstq+stride3q*1], xm3
+ vextracti128 [dstq+strideq*4], m3, 1
+ lea dstq, [dstq+strideq*4]
+ dec iterationd
+ jg .loop
+ REP_RET
+%endif
+
;-----------------------------------------------------------------------------
; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
@@ -2498,10 +2535,7 @@ cglobal pred4x4_tm_vp8_8, 3,3
pshufb mm3, mm6
pshufb mm4, mm6
pshufb mm5, mm6
- psubw mm2, mm7
- psubw mm3, mm7
- psubw mm4, mm7
- psubw mm5, mm7
+ psubw mm0, mm7
paddw mm2, mm0
paddw mm3, mm0
paddw mm4, mm0
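
Both predictors touched here are VP8 "TrueMotion" (TM) modes: the new AVX2
pred16x16 path and the pred4x4_tm_vp8 hunk just above (which now performs the
psubw against mm7 once on mm0 instead of on each of mm2..mm5). A scalar model
of the 16x16 case for reference (a sketch, with per-pixel clipping):

    #include <stddef.h>
    #include <stdint.h>

    /* Every pixel is top[x] + left[y] - topleft, saturated to [0,255].
     * The AVX2 version keeps (top[x] - topleft) as a row of words in m0 and
     * broadcasts left[y] per row before packing with unsigned saturation. */
    static void pred16x16_tm_c(uint8_t *dst, ptrdiff_t stride)
    {
        const uint8_t *top = dst - stride;
        int topleft = top[-1];
        int x, y;

        for (y = 0; y < 16; y++) {
            int left = dst[y * stride - 1];
            for (x = 0; x < 16; x++) {
                int v = top[x] + left - topleft;
                dst[y * stride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
        }
    }
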
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index 7ba9828e17..629e0a72e3 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -26,18 +26,19 @@
SECTION_RODATA
+cextern pw_1023
+%define pw_pixel_max pw_1023
cextern pw_512
cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
+cextern pd_16
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 8 dw -3
-pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_17: times 4 dd 17
-pd_16: times 4 dd 16
SECTION .text
@@ -83,8 +84,10 @@ INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
+%endif
;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
@@ -121,8 +124,10 @@ INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
+%endif
;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
@@ -162,28 +167,14 @@ INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
+%endif
;-----------------------------------------------------------------------------
; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
-%macro HADDD 2 ; sum junk
-%if mmsize == 16
- movhlps %2, %1
- paddd %1, %2
- pshuflw %2, %1, 0xE
- paddd %1, %2
-%else
- pshufw %2, %1, 0xE
- paddd %1, %2
-%endif
-%endmacro
-
-%macro HADDW 2
- pmaddwd %1, [pw_1]
- HADDD %1, %2
-%endmacro
INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
@@ -232,8 +223,10 @@ cglobal pred4x4_down_left_10, 3, 3
INIT_XMM sse2
PRED4x4_DL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
@@ -260,8 +253,10 @@ cglobal pred4x4_vertical_left_10, 3, 3
INIT_XMM sse2
PRED4x4_VL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
@@ -571,8 +566,10 @@ cglobal pred8x8l_top_dc_10, 4, 4, 6
INIT_XMM sse2
PRED8x8L_TOP_DC
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
+%endif
;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
@@ -629,8 +626,10 @@ cglobal pred8x8l_dc_10, 4, 6, 6
INIT_XMM sse2
PRED8x8L_DC
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
@@ -663,8 +662,10 @@ cglobal pred8x8l_vertical_10, 4, 4, 6
INIT_XMM sse2
PRED8x8L_VERTICAL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
@@ -718,8 +719,10 @@ INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
@@ -785,8 +788,10 @@ INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
@@ -858,8 +863,10 @@ INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
@@ -927,8 +934,10 @@ INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
@@ -987,8 +996,10 @@ INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
+%endif
;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 0e572b1226..bdd5125d68 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -127,6 +127,7 @@ PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
+PRED16x16(tm_vp8, 8, avx2)
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
@@ -323,6 +324,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
}
}
}
+
+ if(EXTERNAL_AVX2(cpu_flags)){
+ if (codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
+ }
+ }
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 74458259a6..6b7ecf52b3 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -2,20 +2,20 @@
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
* Copyright (c) 2011 Daniel Kang
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,10 +29,6 @@
#include "fpel.h"
#if HAVE_YASM
-void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
@@ -49,9 +45,9 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
+#define ff_put_pixels4_mmxext ff_put_pixels4_mmx
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
@@ -284,7 +280,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uin
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
@@ -296,7 +292,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uin
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\
@@ -304,74 +300,74 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
+ LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\
diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
index 4557e5e209..872268300a 100644
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -26,12 +26,13 @@
SECTION_RODATA 32
+cextern pd_65535
+cextern pw_1023
+%define pw_pixel_max pw_1023
cextern pw_16
cextern pw_1
cextern pb_0
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
pad10: times 8 dw 10*1023
pad20: times 8 dw 20*1023
pad30: times 8 dw 30*1023
@@ -42,7 +43,6 @@ unpad: times 8 dw 16*1022/32 ; needs to be mod 16
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
-pd_0f: times 4 dd 0xffff
SECTION .text
@@ -708,7 +708,7 @@ h%1_loop_op:
psrad m1, 10
psrad m2, 10
pslld m2, 16
- pand m1, [pd_0f]
+ pand m1, [pd_65535]
por m1, m2
%if num_mmregs <= 8
pxor m0, m0
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index bc6c72541b..2d287ba443 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -6,20 +6,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index f9da05b215..0975d74fcf 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 961ec8ca45..f924e55854 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -26,11 +26,12 @@
SECTION_RODATA 32
-pw_pixel_max: times 8 dw ((1 << 10)-1)
sq_1: dq 1
dq 0
cextern pw_1
+cextern pw_1023
+%define pw_pixel_max pw_1023
SECTION .text
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index 0d5ff3d159..36bf29df02 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 134d594ca9..0643b37362 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -78,6 +78,11 @@ IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
+IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)
+
+IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
+IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)
+
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
@@ -103,28 +108,37 @@ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
int beta);
#define LF_FUNCS(type, depth) \
-LF_FUNC(h, chroma, depth, mmxext) \
-LF_IFUNC(h, chroma_intra, depth, mmxext) \
-LF_FUNC(v, chroma, depth, mmxext) \
-LF_IFUNC(v, chroma_intra, depth, mmxext) \
-LF_FUNC(h, luma, depth, mmxext) \
-LF_IFUNC(h, luma_intra, depth, mmxext) \
-LF_FUNC(h, luma, depth, sse2) \
-LF_IFUNC(h, luma_intra, depth, sse2) \
-LF_FUNC(v, luma, depth, sse2) \
-LF_IFUNC(v, luma_intra, depth, sse2) \
-LF_FUNC(h, chroma, depth, sse2) \
-LF_IFUNC(h, chroma_intra, depth, sse2) \
-LF_FUNC(v, chroma, depth, sse2) \
-LF_IFUNC(v, chroma_intra, depth, sse2) \
-LF_FUNC(h, luma, depth, avx) \
-LF_IFUNC(h, luma_intra, depth, avx) \
-LF_FUNC(v, luma, depth, avx) \
-LF_IFUNC(v, luma_intra, depth, avx) \
-LF_FUNC(h, chroma, depth, avx) \
-LF_IFUNC(h, chroma_intra, depth, avx) \
-LF_FUNC(v, chroma, depth, avx) \
-LF_IFUNC(v, chroma_intra, depth, avx)
+LF_FUNC(h, chroma, depth, mmxext) \
+LF_IFUNC(h, chroma_intra, depth, mmxext) \
+LF_FUNC(h, chroma422, depth, mmxext) \
+LF_IFUNC(h, chroma422_intra, depth, mmxext) \
+LF_FUNC(v, chroma, depth, mmxext) \
+LF_IFUNC(v, chroma_intra, depth, mmxext) \
+LF_FUNC(h, luma, depth, mmxext) \
+LF_IFUNC(h, luma_intra, depth, mmxext) \
+LF_FUNC(h, luma, depth, sse2) \
+LF_IFUNC(h, luma_intra, depth, sse2) \
+LF_FUNC(v, luma, depth, sse2) \
+LF_IFUNC(v, luma_intra, depth, sse2) \
+LF_FUNC(h, chroma, depth, sse2) \
+LF_IFUNC(h, chroma_intra, depth, sse2) \
+LF_FUNC(h, chroma422, depth, sse2) \
+LF_IFUNC(h, chroma422_intra, depth, sse2) \
+LF_FUNC(v, chroma, depth, sse2) \
+LF_IFUNC(v, chroma_intra, depth, sse2) \
+LF_FUNC(h, luma, depth, avx) \
+LF_IFUNC(h, luma_intra, depth, avx) \
+LF_FUNC(v, luma, depth, avx) \
+LF_IFUNC(v, luma_intra, depth, avx) \
+LF_FUNC(h, chroma, depth, avx) \
+LF_IFUNC(h, chroma_intra, depth, avx) \
+LF_FUNC(h, chroma422, depth, avx) \
+LF_IFUNC(h, chroma422_intra, depth, avx) \
+LF_FUNC(v, chroma, depth, avx) \
+LF_IFUNC(v, chroma_intra, depth, avx)
+
+LF_FUNC(h, luma_mbaff, 8, sse2)
+LF_FUNC(h, luma_mbaff, 8, avx)
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
@@ -155,13 +169,13 @@ LF_IFUNC(v, luma_intra, 10, mmxext)
/* weighted prediction */
#define H264_WEIGHT(W, OPT) \
-void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride, \
int height, int log2_denom, \
int weight, int offset);
#define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
- int stride, int height, \
+ ptrdiff_t stride, int height, \
int log2_denom, int weightd, \
int weights, int offset);
@@ -181,7 +195,7 @@ H264_BIWEIGHT_MMX(4)
#define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
- int stride, \
+ ptrdiff_t stride, \
int height, \
int log2_denom, \
int weight, \
@@ -190,7 +204,7 @@ void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
uint8_t *src, \
- int stride, \
+ ptrdiff_t stride, \
int height, \
int log2_denom, \
int weightd, \
@@ -210,6 +224,7 @@ H264_BIWEIGHT_10_SSE(4, 10)
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
+#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
@@ -224,8 +239,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
- if (chroma_format_idc <= 1)
+ if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
+ } else {
+ c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
+ }
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
@@ -244,6 +262,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
}
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
@@ -279,6 +300,20 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
+
+#if ARCH_X86_64
+ c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
+#endif
+
+ c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_sse2;
+ c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_sse2;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_sse2;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
+ }
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
@@ -289,12 +324,30 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+#if ARCH_X86_64
+ c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
+#endif
+
+ c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_avx;
+ c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_avx;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
+ }
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
+ }
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
@@ -307,8 +360,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
- if (chroma_format_idc <= 1)
+ if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
+ } else {
+ c->h264_idct_add8 = ff_h264_idct_add8_422_10_sse2;
+ }
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
@@ -325,6 +381,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
+ }
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
@@ -347,8 +408,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
- if (chroma_format_idc <= 1)
+ if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
+ } else {
+ c->h264_idct_add8 = ff_h264_idct_add8_422_10_avx;
+ }
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_avx;
@@ -357,6 +421,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
+ }
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
@@ -365,4 +434,5 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
#endif /* HAVE_ALIGNED_STACK */
}
}
+#endif
}
diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm
index 66b929c594..d97e4abddb 100644
--- a/libavcodec/x86/hevc_add_res.asm
+++ b/libavcodec/x86/hevc_add_res.asm
@@ -2,49 +2,51 @@
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
-; * This file is part of Libav.
+; * This file is part of FFmpeg.
; *
-; * Libav is free software; you can redistribute it and/or
+; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
-; * Libav is distributed in the hope that it will be useful,
+; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
-; * License along with Libav; if not, write to the Free Software
+; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; ******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA 32
-max_pixels_10: times 16 dw ((1 << 10)-1)
-
SECTION .text
+cextern pw_1023
+%define max_pixels_10 pw_1023
+
; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
%macro ADD_RES_MMX_4_8 0
- mova m0, [r1]
- mova m2, [r1+8]
- pxor m1, m1
+ mova m2, [r1]
+ mova m4, [r1+8]
pxor m3, m3
- psubw m1, m0
psubw m3, m2
- packuswb m0, m2
- packuswb m1, m3
-
- movd m2, [r0]
- movd m3, [r0+r2]
- punpckldq m2, m3
+ packuswb m2, m2
+ packuswb m3, m3
+ pxor m5, m5
+ psubw m5, m4
+ packuswb m4, m4
+ packuswb m5, m5
+
+ movh m0, [r0]
+ movh m1, [r0+r2]
paddusb m0, m2
- psubusb m0, m1
- movd [r0], m0
- psrlq m0, 32
- movd [r0+r2], m0
+ paddusb m1, m4
+ psubusb m0, m3
+ psubusb m1, m5
+ movh [r0], m0
+ movh [r0+r2], m1
%endmacro
@@ -93,8 +95,15 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
vinserti128 m2, m2, [r1+%1+32], 1
vinserti128 m6, m6, [r1+%1+48], 1
%endif
+%if cpuflag(avx)
psubw m1, m0, m2
psubw m5, m0, m6
+%else
+ mova m1, m0
+ mova m5, m0
+ psubw m1, m2
+ psubw m5, m6
+%endif
packuswb m2, m6
packuswb m1, m5
@@ -104,8 +113,15 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
vinserti128 m4, m4, [r1+%1+96 ], 1
vinserti128 m6, m6, [r1+%1+112], 1
%endif
+%if cpuflag(avx)
psubw m3, m0, m4
psubw m5, m0, m6
+%else
+ mova m3, m0
+ mova m5, m0
+ psubw m3, m4
+ psubw m5, m6
+%endif
packuswb m4, m6
packuswb m3, m5
@@ -176,7 +192,7 @@ cglobal hevc_add_residual_32_8, 3, 5, 7
dec r4d
jg .loop
RET
-%endif ;HAVE_AVX2_EXTERNAL
+%endif
%macro ADD_RES_SSE_8_10 4
mova m0, [%4]
@@ -330,7 +346,7 @@ cglobal hevc_add_residual_32_10, 3, 5, 6
mova m5, [max_pixels_10]
mov r4d, 32
-.loop
+.loop:
ADD_RES_SSE_32_10 r0, r1
lea r0, [r0+r2]
add r1, 64
@@ -346,7 +362,7 @@ cglobal hevc_add_residual_16_10, 3, 5, 6
lea r3, [r2*3]
mov r4d, 4
-.loop
+.loop:
ADD_RES_AVX2_16_10 r0, r2, r3, r1
lea r0, [r0+r2*4]
add r1, 128
@@ -359,7 +375,7 @@ cglobal hevc_add_residual_32_10, 3, 5, 6
mova m5, [max_pixels_10]
mov r4d, 16
-.loop
+.loop:
ADD_RES_AVX2_32_10 r0, r2, r1
lea r0, [r0+r2*2]
add r1, 128
diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 153eaf7f94..85ee4800bb 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -26,9 +26,11 @@
SECTION_RODATA
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pw_m2: times 8 dw -2
-pd_1 : times 4 dd 1
+cextern pw_1023
+%define pw_pixel_max_10 pw_1023
+pw_pixel_max_12: times 8 dw ((1 << 12)-1)
+pw_m2: times 8 dw -2
+pd_1 : times 4 dd 1
cextern pw_4
cextern pw_8
@@ -37,11 +39,6 @@ cextern pw_m1
SECTION .text
INIT_XMM sse2
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
- [base], [base+stride], [base+stride*2], [base3], \
- [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
@@ -57,10 +54,10 @@ INIT_XMM sse2
movd m4, %5
movd m6, %6
movd m5, %7
- movd m7, %8
+ movd m3, %8
punpcklbw m4, m6
- punpcklbw m5, m7
+ punpcklbw m5, m3
punpcklwd m4, m5
punpckhdq m2, m0, m4
@@ -76,16 +73,10 @@ INIT_XMM sse2
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
- packuswb m0, m0
- packuswb m1, m1
- packuswb m2, m2
- packuswb m3, m3
-
- punpcklbw m0, m1
- punpcklbw m2, m3
-
- punpckhwd m6, m0, m2
- punpcklwd m0, m2
+ packuswb m0, m2
+ packuswb m1, m3
+ SBUTTERFLY bw, 0, 1, 2
+ SBUTTERFLY wd, 0, 1, 2
movd %1, m0
pshufd m0, m0, 0x39
@@ -95,13 +86,13 @@ INIT_XMM sse2
pshufd m0, m0, 0x39
movd %4, m0
- movd %5, m6
- pshufd m6, m6, 0x39
- movd %6, m6
- pshufd m6, m6, 0x39
- movd %7, m6
- pshufd m6, m6, 0x39
- movd %8, m6
+ movd %5, m1
+ pshufd m1, m1, 0x39
+ movd %6, m1
+ pshufd m1, m1, 0x39
+ movd %7, m1
+ pshufd m1, m1, 0x39
+ movd %8, m1
%endmacro
; in: 8 rows of 4 words in %4..%11
@@ -120,10 +111,10 @@ INIT_XMM sse2
movq m4, %5
movq m6, %6
movq m5, %7
- movq m7, %8
+ movq m3, %8
punpcklwd m4, m6
- punpcklwd m5, m7
+ punpcklwd m5, m3
punpckhdq m6, m4, m5
punpckldq m4, m5
@@ -136,32 +127,23 @@ INIT_XMM sse2
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8
-%macro TRANSPOSE8x4W_STORE 8
- pxor m5, m5; zeros reg
- CLIPW m0, m5, [pw_pixel_max]
- CLIPW m1, m5, [pw_pixel_max]
- CLIPW m2, m5, [pw_pixel_max]
- CLIPW m3, m5, [pw_pixel_max]
+%macro TRANSPOSE8x4W_STORE 9
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
- punpckhwd m4, m0, m1
- punpcklwd m0, m1
- punpckhwd m5, m2, m3
- punpcklwd m2, m3
- punpckhdq m6, m0, m2
- punpckldq m0, m2
+ pxor m5, m5; zeros reg
+ CLIPW m0, m5, %9
+ CLIPW m1, m5, %9
+ CLIPW m2, m5, %9
+ CLIPW m3, m5, %9
movq %1, m0
movhps %2, m0
- movq %3, m6
- movhps %4, m6
-
- punpckhdq m6, m4, m5
- punpckldq m4, m5
-
- movq %5, m4
- movhps %6, m4
- movq %7, m6
- movhps %8, m6
+ movq %3, m1
+ movhps %4, m1
+ movq %5, m2
+ movhps %6, m2
+ movq %7, m3
+ movhps %8, m3
%endmacro
; in: 8 rows of 8 bytes in %1..%8
@@ -212,40 +194,20 @@ INIT_XMM sse2
; in: 8 rows of 8 words in m0..m8
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
- packuswb m0, m0
- packuswb m1, m1
- packuswb m2, m2
- packuswb m3, m3
- packuswb m4, m4
- packuswb m5, m5
- packuswb m6, m6
- packuswb m7, m7
-
- punpcklbw m0, m1
- punpcklbw m2, m3
-
- punpckhwd m8, m0, m2
- punpcklwd m0, m2
+ packuswb m0, m4
+ packuswb m1, m5
+ packuswb m2, m6
+ packuswb m3, m7
+ TRANSPOSE2x4x4B 0, 1, 2, 3, 4
- punpcklbw m4, m5
- punpcklbw m6, m7
-
- punpckhwd m9, m4, m6
- punpcklwd m4, m6
-
- punpckhdq m10, m0, m4; 2, 3
- punpckldq m0, m4; 0, 1
-
- punpckldq m11, m8, m9; 4, 5
- punpckhdq m8, m9; 6, 7
movq %1, m0
movhps %2, m0
- movq %3, m10
- movhps %4, m10
- movq %5, m11
- movhps %6, m11
- movq %7, m8
- movhps %8, m8
+ movq %3, m1
+ movhps %4, m1
+ movq %5, m2
+ movhps %6, m2
+ movq %7, m3
+ movhps %8, m3
%endmacro
; in: 8 rows of 8 words in %1..%8
@@ -264,18 +226,18 @@ INIT_XMM sse2
; in: 8 rows of 8 words in m0..m8
; out: 8 rows of 8 words in %1..%8
-%macro TRANSPOSE8x8W_STORE 8
+%macro TRANSPOSE8x8W_STORE 9
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
pxor m8, m8
- CLIPW m0, m8, [pw_pixel_max]
- CLIPW m1, m8, [pw_pixel_max]
- CLIPW m2, m8, [pw_pixel_max]
- CLIPW m3, m8, [pw_pixel_max]
- CLIPW m4, m8, [pw_pixel_max]
- CLIPW m5, m8, [pw_pixel_max]
- CLIPW m6, m8, [pw_pixel_max]
- CLIPW m7, m8, [pw_pixel_max]
+ CLIPW m0, m8, %9
+ CLIPW m1, m8, %9
+ CLIPW m2, m8, %9
+ CLIPW m3, m8, %9
+ CLIPW m4, m8, %9
+ CLIPW m5, m8, %9
+ CLIPW m6, m8, %9
+ CLIPW m7, m8, %9
movdqu %1, m0
movdqu %2, m1
@@ -318,13 +280,14 @@ ALIGN 16
paddw m5, m4;
;tc calculations
- movd m6, [r2]; tc0
- add r2, 4;
+ movq m6, [tcq]; tc0
punpcklwd m6, m6
- movd m7, [r2]; tc1
- punpcklwd m7, m7
- shufps m6, m7, 0; tc0, tc1
+ pshufd m6, m6, 0xA0; tc0, tc1
+%if cpuflag(ssse3)
+ psignw m4, m6, [pw_m1]; -tc0, -tc1
+%else
pmullw m4, m6, [pw_m1]; -tc0, -tc1
+%endif
;end tc calculations
paddw m5, [pw_4]; +4
@@ -362,11 +325,11 @@ ALIGN 16
paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3
- pshufhw m14, m9, q0033 ;0b00001111; 0d3 0d3 0d0 0d0 in high
- pshuflw m14, m14, q0033 ;0b00001111; 1d3 1d3 1d0 1d0 in low
+ pshufhw m14, m9, 0x0f ;0b00001111; 0d3 0d3 0d0 0d0 in high
+ pshuflw m14, m14, 0x0f ;0b00001111; 1d3 1d3 1d0 1d0 in low
- pshufhw m9, m9, q3300 ;0b11110000; 0d0 0d0 0d3 0d3
- pshuflw m9, m9, q3300 ;0b11110000; 1d0 1d0 1d3 1d3
+ pshufhw m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
+ pshuflw m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
paddw m14, m9; 0d0+0d3, 1d0+1d3
@@ -380,7 +343,7 @@ ALIGN 16
psraw m15, m13, 2; beta >> 2
psllw m8, m9, 1;
pcmpgtw m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
- movmskps r14, m15;
+ movmskps r6, m15;
;end weak / strong decision
; weak filter nd_p/q calculation
@@ -388,19 +351,15 @@ ALIGN 16
psrld m8, 16
paddw m8, m10
movd r7d, m8
- and r7, 0xffff; 1dp0 + 1dp3
pshufd m8, m8, 0x4E
movd r8d, m8
- and r8, 0xffff; 0dp0 + 0dp3
pshufd m8, m11, 0x31
psrld m8, 16
paddw m8, m11
movd r9d, m8
- and r9, 0xffff; 1dq0 + 1dq3
pshufd m8, m8, 0x4E
movd r10d, m8
- and r10, 0xffff; 0dq0 + 0dq3
; end calc for weak filter
; filtering mask
@@ -422,14 +381,13 @@ ALIGN 16
shl r11, %1 - 8
%endif
movd m8, r11d; tc0
- add tcq, 4;
- mov r3d, [tcq];
+ mov r3d, [tcq+4];
%if %1 > 8
shl r3, %1 - 8
%endif
- movd m9, r3d; tc1
add r11d, r3d; tc0 + tc1
jz .bypassluma
+ movd m9, r3d; tc1
punpcklwd m8, m8
punpcklwd m9, m9
shufps m8, m9, 0; tc0, tc1
@@ -453,7 +411,7 @@ ALIGN 16
psraw m13, 3; beta >> 3
pcmpgtw m13, m12;
movmskps r11, m13;
- and r14, r11; strong mask , beta_2 and beta_3 comparisons
+ and r6, r11; strong mask , beta_2 and beta_3 comparisons
;----beta_3 comparison end-----
;----tc25 comparison---
psubw m12, m3, m4; p0 - q0
@@ -464,23 +422,23 @@ ALIGN 16
pcmpgtw m8, m12; tc25 comparisons
movmskps r11, m8;
- and r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons
+ and r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
;----tc25 comparison end---
- mov r11, r14;
+ mov r11, r6;
shr r11, 1;
- and r14, r11; strong mask, bits 2 and 0
+ and r6, r11; strong mask, bits 2 and 0
pmullw m14, m9, [pw_m2]; -tc * 2
paddw m9, m9
- and r14, 5; 0b101
- mov r11, r14; strong mask
- shr r14, 2;
- movd m12, r14d; store to xmm for mask generation
- shl r14, 1
+ and r6, 5; 0b101
+ mov r11, r6; strong mask
+ shr r6, 2;
+ movd m12, r6d; store to xmm for mask generation
+ shl r6, 1
and r11, 1
movd m10, r11d; store to xmm for mask generation
- or r14, r11; final strong mask, bits 1 and 0
+ or r6, r11; final strong mask, bits 1 and 0
jz .weakfilter
shufps m10, m12, 0
@@ -565,16 +523,16 @@ ALIGN 16
MASKED_COPY m3, m12
.weakfilter:
- not r14; strong mask -> weak mask
- and r14, r13; final weak filtering mask, bits 0 and 1
+ not r6; strong mask -> weak mask
+ and r6, r13; final weak filtering mask, bits 0 and 1
jz .store
; weak filtering mask
- mov r11, r14
+ mov r11, r6
shr r11, 1
movd m12, r11d
- and r14, 1
- movd m11, r14d
+ and r6, 1
+ movd m11, r6d
shufps m11, m12, 0
pcmpeqd m11, [pd_1]; filtering mask
@@ -609,7 +567,11 @@ ALIGN 16
pminsw m12, m9; av_clip(delta0, -tc, tc)
psraw m9, 1; tc -> tc / 2
+%if cpuflag(ssse3)
+ psignw m14, m9, [pw_m1]; -tc / 2
+%else
pmullw m14, m9, [pw_m1]; -tc / 2
+%endif
pavgw m15, m1, m3; (p2 + p0 + 1) >> 1
psubw m15, m2; ((p2 + p0 + 1) >> 1) - p1
@@ -658,117 +620,161 @@ ALIGN 16
MASKED_COPY m4, m8
%endmacro
-INIT_XMM sse2
;-----------------------------------------------------------------------------
-; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
; uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_chroma_8, 3, 6, 8
- sub r0, 2
- lea r5, [3 * r1]
- mov r4, r0
- add r0, r5
- TRANSPOSE4x8B_LOAD PASS8ROWS(r4, r0, r1, r5)
+%macro LOOP_FILTER_CHROMA 0
+cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
+ sub pixq, 2
+ lea r3strideq, [3*strideq]
+ mov pix0q, pixq
+ add pixq, r3strideq
+ TRANSPOSE4x8B_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
CHROMA_DEBLOCK_BODY 8
- TRANSPOSE8x4B_STORE PASS8ROWS(r4, r0, r1, r5)
+ TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
RET
-cglobal hevc_v_loop_filter_chroma_10, 3, 6, 8
- sub r0, 4
- lea r5, [3 * r1]
- mov r4, r0
- add r0, r5
- TRANSPOSE4x8W_LOAD PASS8ROWS(r4, r0, r1, r5)
+cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
+ sub pixq, 4
+ lea r3strideq, [3*strideq]
+ mov pix0q, pixq
+ add pixq, r3strideq
+ TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
CHROMA_DEBLOCK_BODY 10
- TRANSPOSE8x4W_STORE PASS8ROWS(r4, r0, r1, r5)
+ TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
+ RET
+
+cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
+ sub pixq, 4
+ lea r3strideq, [3*strideq]
+ mov pix0q, pixq
+ add pixq, r3strideq
+ TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
+ CHROMA_DEBLOCK_BODY 12
+ TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
RET
;-----------------------------------------------------------------------------
-; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
; uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_chroma_8, 3, 6, 8
- mov r5, r0; pix
- sub r5, r1
- sub r5, r1
- movh m0, [r5]; p1
- movh m1, [r5 + r1]; p0
- movh m2, [r0]; q0
- movh m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
+ mov pix0q, pixq
+ sub pix0q, strideq
+ sub pix0q, strideq
+ movq m0, [pix0q]; p1
+ movq m1, [pix0q+strideq]; p0
+ movq m2, [pixq]; q0
+ movq m3, [pixq+strideq]; q1
pxor m5, m5; zeros reg
punpcklbw m0, m5
punpcklbw m1, m5
punpcklbw m2, m5
punpcklbw m3, m5
CHROMA_DEBLOCK_BODY 8
- packuswb m1, m2
- movh [r5 + r1], m1
- movhps [r0], m1
+ packuswb m1, m2
+ movh[pix0q+strideq], m1
+ movhps [pixq], m1
RET
-cglobal hevc_h_loop_filter_chroma_10, 3, 6, 8
- mov r5, r0; pix
- sub r5, r1
- sub r5, r1
- movdqu m0, [r5]; p1
- movdqu m1, [r5+r1]; p0
- movdqu m2, [r0]; q0
- movdqu m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
+ mov pix0q, pixq
+ sub pix0q, strideq
+ sub pix0q, strideq
+ movu m0, [pix0q]; p1
+ movu m1, [pix0q+strideq]; p0
+ movu m2, [pixq]; q0
+ movu m3, [pixq+strideq]; q1
CHROMA_DEBLOCK_BODY 10
pxor m5, m5; zeros reg
- CLIPW m1, m5, [pw_pixel_max]
- CLIPW m2, m5, [pw_pixel_max]
- movdqu [r5 + r1], m1
- movdqu [r0], m2
+ CLIPW m1, m5, [pw_pixel_max_10]
+ CLIPW m2, m5, [pw_pixel_max_10]
+ movu [pix0q+strideq], m1
+ movu [pixq], m2
+ RET
+
+cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
+ mov pix0q, pixq
+ sub pix0q, strideq
+ sub pix0q, strideq
+ movu m0, [pix0q]; p1
+ movu m1, [pix0q+strideq]; p0
+ movu m2, [pixq]; q0
+ movu m3, [pixq+strideq]; q1
+ CHROMA_DEBLOCK_BODY 12
+ pxor m5, m5; zeros reg
+ CLIPW m1, m5, [pw_pixel_max_12]
+ CLIPW m2, m5, [pw_pixel_max_12]
+ movu [pix0q+strideq], m1
+ movu [pixq], m2
RET
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_CHROMA
+INIT_XMM avx
+LOOP_FILTER_CHROMA
%if ARCH_X86_64
-INIT_XMM ssse3
+%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-; int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
- sub r0, 4
- lea r5, [3 * r1]
- mov r6, r0
- add r0, r5
- TRANSPOSE8x8B_LOAD PASS8ROWS(r6, r0, r1, r5)
+cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+ sub pixq, 4
+ lea pix0q, [3 * r1]
+ mov src3strideq, pixq
+ add pixq, pix0q
+ TRANSPOSE8x8B_LOAD PASS8ROWS(src3strideq, pixq, r1, pix0q)
LUMA_DEBLOCK_BODY 8, v
.store:
- TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5)
+ TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
.bypassluma:
RET
-cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
+cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
sub pixq, 8
- lea r5, [3 * strideq]
- mov r6, pixq
- add pixq, r5
- TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5)
+ lea pix0q, [3 * strideq]
+ mov src3strideq, pixq
+ add pixq, pix0q
+ TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
LUMA_DEBLOCK_BODY 10, v
.store:
- TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
+ TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
+.bypassluma:
+ RET
+
+cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+ sub pixq, 8
+ lea pix0q, [3 * strideq]
+ mov src3strideq, pixq
+ add pixq, pix0q
+ TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
+ LUMA_DEBLOCK_BODY 12, v
+.store:
+ TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
.bypassluma:
RET
;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-; int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
sub pix0q, strideq
- movdqu m0, [pix0q]; p3
- movdqu m1, [pix0q + strideq]; p2
- movdqu m2, [pix0q + 2 * strideq]; p1
- movdqu m3, [pix0q + src3strideq]; p0
- movdqu m4, [pixq]; q0
- movdqu m5, [pixq + strideq]; q1
- movdqu m6, [pixq + 2 * strideq]; q2
- movdqu m7, [pixq + src3strideq]; q3
+ movq m0, [pix0q]; p3
+ movq m1, [pix0q + strideq]; p2
+ movq m2, [pix0q + 2 * strideq]; p1
+ movq m3, [pix0q + src3strideq]; p0
+ movq m4, [pixq]; q0
+ movq m5, [pixq + strideq]; q1
+ movq m6, [pixq + 2 * strideq]; q2
+ movq m7, [pixq + src3strideq]; q3
pxor m8, m8
punpcklbw m0, m8
punpcklbw m1, m8
@@ -783,16 +789,16 @@ cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0
packuswb m1, m2
packuswb m3, m4
packuswb m5, m6
- movh [r5 + r1], m1
- movhps [r5 + 2 * r1], m1
- movh [r5 + r6], m3
- movhps [r0 ], m3
- movh [r0 + r1], m5
- movhps [r0 + 2 * r1], m5
+ movh [pix0q + strideq], m1
+ movhps [pix0q + 2 * strideq], m1
+ movh [pix0q + src3strideq], m3
+ movhps [pixq ], m3
+ movh [pixq + strideq], m5
+ movhps [pixq + 2 * strideq], m5
.bypassluma:
RET
-cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
@@ -808,12 +814,12 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
LUMA_DEBLOCK_BODY 10, h
.store:
pxor m8, m8; zeros reg
- CLIPW m1, m8, [pw_pixel_max]
- CLIPW m2, m8, [pw_pixel_max]
- CLIPW m3, m8, [pw_pixel_max]
- CLIPW m4, m8, [pw_pixel_max]
- CLIPW m5, m8, [pw_pixel_max]
- CLIPW m6, m8, [pw_pixel_max]
+ CLIPW m1, m8, [pw_pixel_max_10]
+ CLIPW m2, m8, [pw_pixel_max_10]
+ CLIPW m3, m8, [pw_pixel_max_10]
+ CLIPW m4, m8, [pw_pixel_max_10]
+ CLIPW m5, m8, [pw_pixel_max_10]
+ CLIPW m6, m8, [pw_pixel_max_10]
movdqu [pix0q + strideq], m1; p2
movdqu [pix0q + 2 * strideq], m2; p1
movdqu [pix0q + src3strideq], m3; p0
@@ -822,4 +828,44 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
movdqu [pixq + 2 * strideq], m6; q2
.bypassluma:
RET
+
+cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+ lea src3strideq, [3 * strideq]
+ mov pix0q, pixq
+ sub pix0q, src3strideq
+ sub pix0q, strideq
+ movdqu m0, [pix0q]; p3
+ movdqu m1, [pix0q + strideq]; p2
+ movdqu m2, [pix0q + 2 * strideq]; p1
+ movdqu m3, [pix0q + src3strideq]; p0
+ movdqu m4, [pixq]; q0
+ movdqu m5, [pixq + strideq]; q1
+ movdqu m6, [pixq + 2 * strideq]; q2
+ movdqu m7, [pixq + src3strideq]; q3
+ LUMA_DEBLOCK_BODY 12, h
+.store:
+ pxor m8, m8; zeros reg
+ CLIPW m1, m8, [pw_pixel_max_12]
+ CLIPW m2, m8, [pw_pixel_max_12]
+ CLIPW m3, m8, [pw_pixel_max_12]
+ CLIPW m4, m8, [pw_pixel_max_12]
+ CLIPW m5, m8, [pw_pixel_max_12]
+ CLIPW m6, m8, [pw_pixel_max_12]
+ movdqu [pix0q + strideq], m1; p2
+ movdqu [pix0q + 2 * strideq], m2; p1
+ movdqu [pix0q + src3strideq], m3; p0
+ movdqu [pixq ], m4; q0
+ movdqu [pixq + strideq], m5; q1
+ movdqu [pixq + 2 * strideq], m6; q2
+.bypassluma:
+ RET
+
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_LUMA
+INIT_XMM ssse3
+LOOP_FILTER_LUMA
+INIT_XMM avx
+LOOP_FILTER_LUMA
%endif
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
index f397cc1097..1eb1973f27 100644
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2014 James Almer
;* Copyright (c) 2016 Alexandra Hájková
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -234,9 +234,9 @@ times 4 dw 78, -82
times 4 dw 85, -88
times 4 dw 90, -90
-section .text
+SECTION .text
-; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
+; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; %1 = HxW
; %2 = number of loops
; %3 = bitdepth
@@ -844,7 +844,10 @@ IDCT_4x4 %1
INIT_IDCT_DC 8
INIT_IDCT_DC 10
+INIT_IDCT_DC 12
INIT_IDCT 8, sse2
INIT_IDCT 8, avx
INIT_IDCT 10, sse2
INIT_IDCT 10, avx
+;INIT_IDCT 12, sse2
+;INIT_IDCT 12, avx
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 16e5eefc69..ff6ed0711a 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -1,851 +1,1672 @@
-;*****************************************************************************
-;* x86-optimized HEVC MC
-;* Copyright 2015 Anton Khirnov
-;*
-;* This file is part of Libav.
-;*
-;* Libav is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* Libav is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_8192
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+%define pw_8 pw_512
+%define pw_10 pw_2048
+%define pw_12 pw_8192
+%define pw_bi_10 pw_1024
+%define pw_bi_12 pw_4096
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+pw_bi_8: times 16 dw (1 << 8)
+max_pixels_12: times 16 dw ((1 << 12)-1)
+cextern pd_1
+cextern pb_0
+
+%macro EPEL_TABLE 4
+hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
+ times %2 d%3 10, -2
+ times %2 d%3 -4, 54
+ times %2 d%3 16, -2
+ times %2 d%3 -6, 46
+ times %2 d%3 28, -4
+ times %2 d%3 -4, 36
+ times %2 d%3 36, -4
+ times %2 d%3 -4, 28
+ times %2 d%3 46, -6
+ times %2 d%3 -2, 16
+ times %2 d%3 54, -4
+ times %2 d%3 -2, 10
+ times %2 d%3 58, -2
+%endmacro
-pw_1023: times 8 dw 1023
-cextern hevc_qpel_coeffs
-cextern hevc_qpel_coeffs8
+EPEL_TABLE 8,16, b, avx2
+EPEL_TABLE 10, 8, w, avx2
+
+EPEL_TABLE 8, 8, b, sse4
+EPEL_TABLE 10, 4, w, sse4
+EPEL_TABLE 12, 4, w, sse4
+
+%macro QPEL_TABLE 4
+hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
+ times %2 d%3 -10, 58
+ times %2 d%3 17, -5
+ times %2 d%3 1, 0
+ times %2 d%3 -1, 4
+ times %2 d%3 -11, 40
+ times %2 d%3 40,-11
+ times %2 d%3 4, -1
+ times %2 d%3 0, 1
+ times %2 d%3 -5, 17
+ times %2 d%3 58,-10
+ times %2 d%3 4, -1
+%endmacro
-cextern hevc_epel_coeffs
-cextern hevc_epel_coeffs8
+QPEL_TABLE 8, 8, b, sse4
+QPEL_TABLE 10, 4, w, sse4
+QPEL_TABLE 12, 4, w, sse4
-cextern pw_8
-cextern pw_16
-cextern pw_32
-cextern pw_64
+QPEL_TABLE 8,16, b, avx2
+QPEL_TABLE 10, 8, w, avx2
SECTION .text
-; %1: width
-; %2: bit depth
-%macro COMMON_DEFS 2
- %assign blocksize 8
- %assign nb_blocks ((%1 + blocksize - 1) / blocksize)
- %define last_block_truncated (blocksize * nb_blocks > %1)
- %if %2 > 8
- %define LOAD_BLOCK movu
- %define LOAD_HALFBLOCK movq
- %assign pixelsize 2
- %else
- %define LOAD_BLOCK movq
- %define LOAD_HALFBLOCK movd
- %assign pixelsize 1
- %endif
- %define STORE_BLOCK mova
- %define STORE_HALFBLOCK movq
-%endmacro
-
-; %1: block index
-%macro BLOCK_DEFS 1
- %if last_block_truncated && %1 == nb_blocks - 1
- %define block_truncated 1
- %define LOAD LOAD_HALFBLOCK
- %define STORE STORE_HALFBLOCK
- %else
- %define block_truncated 0
- %define LOAD LOAD_BLOCK
- %define STORE STORE_BLOCK
- %endif
-%endmacro
-
-
-; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
-; pixel *src, ptrdiff_t srcstride,
-; int height, int mx, int my, int *mcbuffer)
-
-; %1: block width
-; %2: bit depth
-; %3: log2 of height unroll
-%macro GET_PIXELS 3
-cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused
-
- %assign shift 14 - %2
- COMMON_DEFS %1, %2
-
-%if pixelsize == 1
- pxor m0, m0
-%endif
-
- shr heightd, %3
-
-.loop:
+%define MAX_PB_SIZE 64
-%assign i 0
-%rep (1 << %3)
+%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
-%assign j 0
-%rep nb_blocks
+%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
- BLOCK_DEFS j
+%if ARCH_X86_64
- LOAD m1, [srcq + j * pixelsize * blocksize]
-%if pixelsize == 1
- punpcklbw m1, m0
+%macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
+%if %1 <= 4
+ movq %3, [%2] ; load data from source2
+%elif %1 <= 8
+ movdqa %3, [%2] ; load data from source2
+%elif %1 <= 12
+%if cpuflag(avx2)
+ mova %3, [%2]
+%else
+ movdqa %3, [%2] ; load data from source2
+ movq %4, [%2+16] ; load data from source2
+%endif ;avx
+%elif %1 <= 16
+%if cpuflag(avx2)
+ mova %3, [%2]
+%else
+ movdqa %3, [%2] ; load data from source2
+ movdqa %4, [%2+16] ; load data from source2
+%endif ; avx
+%else ; %1 = 32
+ mova %3, [%2]
+ mova %4, [%2+32]
%endif
- psllw m1, shift
- STORE [dstq + j * 2 * blocksize], m1
-
-%assign j (j + 1)
-%endrep
+%endmacro
- add dstq, dststrideq
- add srcq, srcstrideq
+%macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+ movd %4, [%3] ; load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+ movq %4, [%3] ; load data from source
+%elif notcpuflag(avx)
+ movu %4, [%3] ; load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+ movdqu %4, [%3]
+%else
+ movu %4, [%3]
+%endif
+%endmacro
-%assign i (i + 1)
-%endrep
- dec heightd
- jg .loop
- RET
+%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
+%if cpuflag(avx2)
+%assign %%offset 32
+%ifdef PIC
+ lea %5q, [hevc_epel_filters_avx2_%1]
+ %define FILTER %5q
+%else
+ %define FILTER hevc_epel_filters_avx2_%1
+%endif
+%else
+%assign %%offset 16
+%ifdef PIC
+ lea %5q, [hevc_epel_filters_sse4_%1]
+ %define FILTER %5q
+%else
+ %define FILTER hevc_epel_filters_sse4_%1
+%endif
+%endif ;cpuflag(avx2)
+ sub %2q, 1
+%if cpuflag(avx2)
+ shl %2q, 6 ; multiply by 64
+%else
+ shl %2q, 5 ; multiply by 32
+%endif
+ mova %3, [FILTER + %2q] ; taps 1-2, repeated across the register
+ mova %4, [FILTER + %2q+%%offset] ; taps 3-4, repeated across the register
%endmacro
-INIT_XMM sse2
-GET_PIXELS 4, 8, 1
-GET_PIXELS 8, 8, 1
-GET_PIXELS 12, 8, 3
-GET_PIXELS 16, 8, 2
-GET_PIXELS 24, 8, 3
-GET_PIXELS 32, 8, 3
-GET_PIXELS 48, 8, 3
-GET_PIXELS 64, 8, 3
-
-GET_PIXELS 4, 10, 1
-GET_PIXELS 8, 10, 1
-GET_PIXELS 12, 10, 3
-GET_PIXELS 16, 10, 2
-GET_PIXELS 24, 10, 3
-GET_PIXELS 32, 10, 3
-GET_PIXELS 48, 10, 3
-GET_PIXELS 64, 10, 3
-
-; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
-; uint8_t *src, ptrdiff_t srcstride,
-; int height, int mx, int my, int *mcbuffer)
-
-; 8-bit qpel interpolation
-; %1: block width
-; %2: 0 - horizontal; 1 - vertical
-%macro QPEL_8 2
-%if %2
- %define postfix v
- %define mvfrac myq
- %define coeffsaddr r5q
- %define pixstride srcstrideq
- %define pixstride3 r5q
- %define src_m3 r6q
-%else
- %define postfix h
- %define mvfrac mxq
- %define coeffsaddr r6q
- %define pixstride 1
- %define pixstride3 3
- %define src_m3 (srcq - 3)
-%endif
-
- COMMON_DEFS %1, 8
-
-cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my
- and mvfrac, 0x3
- dec mvfrac
- shl mvfrac, 4
- lea coeffsaddr, [hevc_qpel_coeffs8]
- mova m0, [coeffsaddr + mvfrac]
-
- SPLATW m1, m0, 1
- SPLATW m2, m0, 2
- SPLATW m3, m0, 3
- SPLATW m0, m0, 0
-
-%if %2
- lea pixstride3, [srcstrideq + 2 * srcstrideq]
- mov src_m3, srcq
- sub src_m3, pixstride3
+%macro EPEL_HV_FILTER 1
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift 6
+%define %%table hevc_epel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift 5
+%define %%table hevc_epel_filters_sse4_%1
%endif
-.loop:
-
-%assign i 0
-%rep nb_blocks
+%ifdef PIC
+ lea r3srcq, [%%table]
+ %define FILTER r3srcq
+%else
+ %define FILTER %%table
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 32 (64 for avx2)
+ shl myq, %%shift ; multiply by 32 (64 for avx2)
+ mova m14, [FILTER + mxq] ; taps 1-2, repeated across the register
+ mova m15, [FILTER + mxq+%%offset] ; taps 3-4, repeated across the register
+
+%if cpuflag(avx2)
+%define %%table hevc_epel_filters_avx2_10
+%else
+%define %%table hevc_epel_filters_sse4_10
+%endif
+%ifdef PIC
+ lea r3srcq, [%%table]
+ %define FILTER r3srcq
+%else
+ %define FILTER %%table
+%endif
+ mova m12, [FILTER + myq] ; taps 1-2, repeated across the register
+ mova m13, [FILTER + myq+%%offset] ; taps 3-4, repeated across the register
+ lea r3srcq, [srcstrideq*3]
+%endmacro
- BLOCK_DEFS i
+%macro QPEL_FILTER 2
- LOAD m4, [src_m3 + i * blocksize]
- LOAD m5, [src_m3 + i * blocksize + 1 * pixstride]
- punpcklbw m4, m5
- pmaddubsw m4, m0
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift 7
+%define %%table hevc_qpel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift 6
+%define %%table hevc_qpel_filters_sse4_%1
+%endif
- LOAD m5, [src_m3 + i * blocksize + 2 * pixstride]
- LOAD m6, [srcq + i * blocksize]
- punpcklbw m5, m6
- pmaddubsw m5, m1
- paddsw m4, m5
+%ifdef PIC
+ lea rfilterq, [%%table]
+%else
+ %define rfilterq %%table
+%endif
+ sub %2q, 1
+ shl %2q, %%shift ; multiply by 64 (128 for avx2)
+ mova m12, [rfilterq + %2q] ; taps 1-2, repeated across the register
+ mova m13, [rfilterq + %2q + %%offset] ; taps 3-4, repeated across the register
+ mova m14, [rfilterq + %2q + 2*%%offset] ; taps 5-6, repeated across the register
+ mova m15, [rfilterq + %2q + 3*%%offset] ; taps 7-8, repeated across the register
+%endmacro
- LOAD m5, [srcq + i * blocksize + 1 * pixstride]
- LOAD m6, [srcq + i * blocksize + 2 * pixstride]
- punpcklbw m5, m6
- pmaddubsw m5, m2
- paddsw m4, m5
+%macro EPEL_LOAD 4
+%if (%1 == 8 && %4 <= 4)
+%define %%load movd
+%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
+%define %%load movq
+%else
+%define %%load movdqu
+%endif
- LOAD m5, [srcq + i * blocksize + pixstride3]
- LOAD m6, [srcq + i * blocksize + 4 * pixstride]
- punpcklbw m5, m6
- pmaddubsw m5, m3
- paddsw m4, m5
+ %%load m0, [%2q ]
+%ifnum %3
+ %%load m1, [%2q+ %3]
+ %%load m2, [%2q+2*%3]
+ %%load m3, [%2q+3*%3]
+%else
+ %%load m1, [%2q+ %3q]
+ %%load m2, [%2q+2*%3q]
+ %%load m3, [%2q+r3srcq]
+%endif
+%if %1 == 8
+%if %4 > 8
+ SBUTTERFLY bw, 0, 1, 7
+ SBUTTERFLY bw, 2, 3, 7
+%else
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+%endif
+%else
+%if %4 > 4
+ SBUTTERFLY wd, 0, 1, 7
+ SBUTTERFLY wd, 2, 3, 7
+%else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+%endif
+%endif
+%endmacro
- STORE [dstq + i * 2 * blocksize], m4
-%assign i (i + 1)
-%endrep
+%macro QPEL_H_LOAD 4
+%assign %%stride (%1+7)/8
+%if %1 == 8
+%if %3 <= 4
+%define %%load movd
+%elif %3 == 8
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%else
+%if %3 == 2
+%define %%load movd
+%elif %3 == 4
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%endif
+ %%load m0, [%2-3*%%stride] ;load data from source
+ %%load m1, [%2-2*%%stride]
+ %%load m2, [%2-%%stride ]
+ %%load m3, [%2 ]
+ %%load m4, [%2+%%stride ]
+ %%load m5, [%2+2*%%stride]
+ %%load m6, [%2+3*%%stride]
+ %%load m7, [%2+4*%%stride]
+
+%if %1 == 8
+%if %3 > 8
+ SBUTTERFLY wd, 0, 1, %4
+ SBUTTERFLY wd, 2, 3, %4
+ SBUTTERFLY wd, 4, 5, %4
+ SBUTTERFLY wd, 6, 7, %4
+%else
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+%endif
+%else
+%if %3 > 4
+ SBUTTERFLY dq, 0, 1, %4
+ SBUTTERFLY dq, 2, 3, %4
+ SBUTTERFLY dq, 4, 5, %4
+ SBUTTERFLY dq, 6, 7, %4
+%else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+%endif
+%endif
+%endmacro
- add dstq, dststrideq
- add srcq, srcstrideq
-%if %2
- add src_m3, srcstrideq
+%macro QPEL_V_LOAD 5
+ lea %5q, [%2]
+ sub %5q, r3srcq
+ movu m0, [%5q ] ; load x - 3*srcstride
+ movu m1, [%5q+ %3q ] ; load x - 2*srcstride
+ movu m2, [%5q+ 2*%3q ] ; load x - srcstride
+ movu m3, [%2 ] ; load x
+ movu m4, [%2+ %3q] ; load x + srcstride
+ movu m5, [%2+ 2*%3q] ; load x + 2*srcstride
+ movu m6, [%2+r3srcq] ; load x + 3*srcstride
+ movu m7, [%2+ 4*%3q] ; load x + 4*srcstride
+%if %1 == 8
+%if %4 > 8
+ SBUTTERFLY bw, 0, 1, 8
+ SBUTTERFLY bw, 2, 3, 8
+ SBUTTERFLY bw, 4, 5, 8
+ SBUTTERFLY bw, 6, 7, 8
+%else
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
%endif
+%else
+%if %4 > 4
+ SBUTTERFLY wd, 0, 1, 8
+ SBUTTERFLY wd, 2, 3, 8
+ SBUTTERFLY wd, 4, 5, 8
+ SBUTTERFLY wd, 6, 7, 8
+%else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+%endif
+%endif
+%endmacro
- dec heightd
- jg .loop
- RET
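+; PEL_<depth>STORE<width> writes <width> output samples of the given bit depth
+; (8-bit variants pack to bytes, 10/12-bit variants store words), using
+; movd/movq/pextrw partial stores for the narrow widths.  The plain put_*
+; functions always use the 10STORE forms, since their destination is the
+; int16 intermediate buffer.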
+%macro PEL_12STORE2 3
+ movd [%1], %2
+%endmacro
+%macro PEL_12STORE4 3
+ movq [%1], %2
+%endmacro
+%macro PEL_12STORE6 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_12STORE8 3
+ movdqa [%1], %2
+%endmacro
+%macro PEL_12STORE12 3
+ movdqa [%1], %2
+ movq [%1+16], %3
+%endmacro
+%macro PEL_12STORE16 3
+ PEL_12STORE8 %1, %2, %3
+ movdqa [%1+16], %3
%endmacro
-INIT_XMM ssse3
-QPEL_8 4, 0
-QPEL_8 8, 0
-QPEL_8 12, 0
-QPEL_8 16, 0
-QPEL_8 24, 0
-QPEL_8 32, 0
-QPEL_8 48, 0
-QPEL_8 64, 0
-
-QPEL_8 4, 1
-QPEL_8 8, 1
-QPEL_8 12, 1
-QPEL_8 16, 1
-QPEL_8 24, 1
-QPEL_8 32, 1
-QPEL_8 48, 1
-QPEL_8 64, 1
-
-; 16-bit qpel interpolation
-; %1: block width
-; %2: shift applied to the result
-; %3: 0 - horizontal; 1 - vertical
-%macro QPEL_16 3
-%if %3
- %define mvfrac myq
- %define pixstride srcstrideq
- %define pixstride3 sstride3q
- %define src_m3 srcm3q
-%else
- %define mvfrac mxq
- %define pixstride 2
- %define pixstride3 6
- %define src_m3 (srcq - 6)
-%endif
-
- COMMON_DEFS %1, 16
-
- and mvfrac, 0x3
- dec mvfrac
- shl mvfrac, 4
- lea coeffsregq, [hevc_qpel_coeffs]
- mova m0, [coeffsregq + mvfrac]
-
- pshufd m1, m0, 0x55
- pshufd m2, m0, 0xaa
- pshufd m3, m0, 0xff
- pshufd m0, m0, 0x00
-
-%if %3
- lea sstride3q, [srcstrideq + 2 * srcstrideq]
- mov srcm3q, srcq
- sub srcm3q, sstride3q
+%macro PEL_10STORE2 3
+ movd [%1], %2
+%endmacro
+%macro PEL_10STORE4 3
+ movq [%1], %2
+%endmacro
+%macro PEL_10STORE6 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_10STORE8 3
+ movdqa [%1], %2
+%endmacro
+%macro PEL_10STORE12 3
+ movdqa [%1], %2
+ movq [%1+16], %3
+%endmacro
+%macro PEL_10STORE16 3
+%if cpuflag(avx2)
+ movu [%1], %2
+%else
+ PEL_10STORE8 %1, %2, %3
+ movdqa [%1+16], %3
%endif
+%endmacro
-.loop:
+%macro PEL_10STORE32 3
+ PEL_10STORE16 %1, %2, %3
+ movu [%1+32], %3
+%endmacro
-%assign i 0
-%rep nb_blocks
+%macro PEL_8STORE2 3
+ pextrw [%1], %2, 0
+%endmacro
+%macro PEL_8STORE4 3
+ movd [%1], %2
+%endmacro
+%macro PEL_8STORE6 3
+ movd [%1], %2
+ pextrw [%1+4], %2, 2
+%endmacro
+%macro PEL_8STORE8 3
+ movq [%1], %2
+%endmacro
+%macro PEL_8STORE12 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_8STORE16 3
+%if cpuflag(avx2)
+ movdqu [%1], %2
+%else
+ mova [%1], %2
+%endif ; avx
+%endmacro
+%macro PEL_8STORE32 3
+ movu [%1], %2
+%endmacro
- BLOCK_DEFS i
+%macro LOOP_END 3
+ add %1q, 2*MAX_PB_SIZE ; dst += MAX_PB_SIZE int16 elements (intermediate stride)
+ add %2q, %3q ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+%endmacro
- LOAD m4, [src_m3 + i * 2 * blocksize]
- LOAD m5, [src_m3 + i * 2 * blocksize + 1 * pixstride]
- LOAD m6, [src_m3 + i * 2 * blocksize + 2 * pixstride]
- LOAD m7, [srcq + i * 2 * blocksize + 0 * pixstride]
- LOAD m8, [srcq + i * 2 * blocksize + 1 * pixstride]
- LOAD m9, [srcq + i * 2 * blocksize + 2 * pixstride]
- LOAD m10, [srcq + i * 2 * blocksize + pixstride3]
- LOAD m11, [srcq + i * 2 * blocksize + 4 * pixstride]
- punpcklwd m12, m4, m5
- pmaddwd m12, m0
+%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && %0 ==3
+%if %1 > 16
+ vextracti128 xm1, m0, 1
+ pmovzxbw m1, xm1
+ psllw m1, 14-%2
+%endif
+ pmovzxbw m0, xm0
+%else ; not avx2
+%if %1 > 8
+ punpckhbw m1, m0, m2
+ psllw m1, 14-%2
+%endif
+ punpcklbw m0, m2
+%endif
+%endif ; %2 == 8
+ psllw m0, 14-%2
+%endmacro
+
+%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
+%if %0 == 8
+%define %%reg0 %5
+%define %%reg2 %6
+%define %%reg1 %7
+%define %%reg3 %8
+%else
+%define %%reg0 m0
+%define %%reg2 m2
+%define %%reg1 m1
+%define %%reg3 m3
+%endif
+%if %1 == 8
+%if cpuflag(avx2) && (%0 == 5)
+%if %2 > 16
+ vperm2i128 m10, m0, m1, q0301
+%endif
+ vinserti128 m0, m0, xm1, 1
+ mova m1, m10
+%if %2 > 16
+ vperm2i128 m10, m2, m3, q0301
+%endif
+ vinserti128 m2, m2, xm3, 1
+ mova m3, m10
+%endif
+ pmaddubsw %%reg0, %3 ;x1*c1+x2*c2
+ pmaddubsw %%reg2, %4 ;x3*c3+x4*c4
+ paddw %%reg0, %%reg2
+%if %2 > 8
+ pmaddubsw %%reg1, %3
+ pmaddubsw %%reg3, %4
+ paddw %%reg1, %%reg3
+%endif
+%else
+ pmaddwd %%reg0, %3
+ pmaddwd %%reg2, %4
+ paddd %%reg0, %%reg2
+%if %2 > 4
+ pmaddwd %%reg1, %3
+ pmaddwd %%reg3, %4
+ paddd %%reg1, %%reg3
+%if %1 != 8
+ psrad %%reg1, %1-8
+%endif
+%endif
+%if %1 != 8
+ psrad %%reg0, %1-8
+%endif
+ packssdw %%reg0, %%reg1
+%endif
+%endmacro
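+; EPEL_COMPUTE applies the 4-tap filter: two multiply-accumulates (taps 1-2 and
+; taps 3-4) followed by an add.  For word input the 32-bit sums are shifted
+; right by (bit depth - 8) and packed back to signed words.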
- punpcklwd m13, m6, m7
- pmaddwd m13, m1
- paddd m12, m13
+%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
- punpcklwd m13, m8, m9
- pmaddwd m13, m2
- paddd m12, m13
+%if cpuflag(avx2)
+%assign %%offset 32
+%define %%table hevc_qpel_filters_avx2_%2
+%else
+%assign %%offset 16
+%define %%table hevc_qpel_filters_sse4_%2
+%endif
- punpcklwd m13, m10, m11
- pmaddwd m13, m3
- paddd m12, m13
- psrad m12, %2
+%ifdef PIC
+ lea rfilterq, [%%table]
+%else
+ %define rfilterq %%table
+%endif
- %if block_truncated == 0
- punpckhwd m4, m5
- pmaddwd m4, m0
+%if %2 == 8
+ pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
+ pmaddubsw m2, [rfilterq + %3q*8+%%offset] ;x3*c3+x4*c4
+ pmaddubsw m4, [rfilterq + %3q*8+2*%%offset] ;x5*c5+x6*c6
+ pmaddubsw m6, [rfilterq + %3q*8+3*%%offset] ;x7*c7+x8*c8
+ paddw m0, m2
+ paddw m4, m6
+ paddw m0, m4
+%else
+ pmaddwd m0, [rfilterq + %3q*8 ]
+ pmaddwd m2, [rfilterq + %3q*8+%%offset]
+ pmaddwd m4, [rfilterq + %3q*8+2*%%offset]
+ pmaddwd m6, [rfilterq + %3q*8+3*%%offset]
+ paddd m0, m2
+ paddd m4, m6
+ paddd m0, m4
+%if %2 != 8
+ psrad m0, %2-8
+%endif
+%if %1 > 4
+ pmaddwd m1, [rfilterq + %3q*8 ]
+ pmaddwd m3, [rfilterq + %3q*8+%%offset]
+ pmaddwd m5, [rfilterq + %3q*8+2*%%offset]
+ pmaddwd m7, [rfilterq + %3q*8+3*%%offset]
+ paddd m1, m3
+ paddd m5, m7
+ paddd m1, m5
+%if %2 != 8
+ psrad m1, %2-8
+%endif
+%endif
+ p%4 m0, m1
+%endif
+%endmacro
- punpckhwd m6, m7
- pmaddwd m6, m1
- paddd m4, m6
+%macro QPEL_COMPUTE 2-3 ; width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && (%0 == 3)
- punpckhwd m8, m9
- pmaddwd m8, m2
- paddd m4, m8
+ vperm2i128 m10, m0, m1, q0301
+ vinserti128 m0, m0, xm1, 1
+ SWAP 1, 10
- punpckhwd m10, m11
- pmaddwd m10, m3
- paddd m4, m10
+ vperm2i128 m10, m2, m3, q0301
+ vinserti128 m2, m2, xm3, 1
+ SWAP 3, 10
- psrad m4, %2
- %endif
- packssdw m12, m4
- STORE [dstq + i * 2 * blocksize], m12
-%assign i (i + 1)
-%endrep
+ vperm2i128 m10, m4, m5, q0301
+ vinserti128 m4, m4, xm5, 1
+ SWAP 5, 10
- add dstq, dststrideq
- add srcq, srcstrideq
-%if %3
- add srcm3q, srcstrideq
+ vperm2i128 m10, m6, m7, q0301
+ vinserti128 m6, m6, xm7, 1
+ SWAP 7, 10
%endif
- dec heightd
- jg .loop
- RET
+ pmaddubsw m0, m12 ;x1*c1+x2*c2
+ pmaddubsw m2, m13 ;x3*c3+x4*c4
+ pmaddubsw m4, m14 ;x5*c5+x6*c6
+ pmaddubsw m6, m15 ;x7*c7+x8*c8
+ paddw m0, m2
+ paddw m4, m6
+ paddw m0, m4
+%if %1 > 8
+ pmaddubsw m1, m12
+ pmaddubsw m3, m13
+ pmaddubsw m5, m14
+ pmaddubsw m7, m15
+ paddw m1, m3
+ paddw m5, m7
+ paddw m1, m5
+%endif
+%else
+ pmaddwd m0, m12
+ pmaddwd m2, m13
+ pmaddwd m4, m14
+ pmaddwd m6, m15
+ paddd m0, m2
+ paddd m4, m6
+ paddd m0, m4
+%if %2 != 8
+ psrad m0, %2-8
+%endif
+%if %1 > 4
+ pmaddwd m1, m12
+ pmaddwd m3, m13
+ pmaddwd m5, m14
+ pmaddwd m7, m15
+ paddd m1, m3
+ paddd m5, m7
+ paddd m1, m5
+%if %2 != 8
+ psrad m1, %2-8
+%endif
+%endif
+%endif
%endmacro
-%if ARCH_X86_64
-
-%macro QPEL_H_10 1
-cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg
-QPEL_16 %1, 2, 0
-%endmacro
-
-INIT_XMM avx
-QPEL_H_10 4
-QPEL_H_10 8
-QPEL_H_10 12
-QPEL_H_10 16
-QPEL_H_10 24
-QPEL_H_10 32
-QPEL_H_10 48
-QPEL_H_10 64
-
-%macro QPEL_V_10 1
-cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
-QPEL_16 %1, 2, 1
-%endmacro
-
-INIT_XMM avx
-QPEL_V_10 4
-QPEL_V_10 8
-QPEL_V_10 12
-QPEL_V_10 16
-QPEL_V_10 24
-QPEL_V_10 32
-QPEL_V_10 48
-QPEL_V_10 64
-
-; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
-; uint8_t *src, ptrdiff_t srcstride,
-; int height, int mx, int my, int *mcbuffer)
-
-%macro QPEL_HV 1
-cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
-QPEL_16 %1, 6, 1
-%endmacro
-
-INIT_XMM avx
-QPEL_HV 4
-QPEL_HV 8
-QPEL_HV 12
-QPEL_HV 16
-QPEL_HV 24
-QPEL_HV 32
-QPEL_HV 48
-QPEL_HV 64
-
-%endif ; ARCH_X86_64
-
-; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
-; uint8_t *src, ptrdiff_t srcstride,
-; int height, int mx, int my, int *mcbuffer)
+%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, src2l, src2h, pw
+ paddsw %3, %5
+%if %1 > 8
+ paddsw %4, %6
+%endif
+ UNI_COMPUTE %1, %2, %3, %4, %7
+%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
+ vpermq %3, %3, 216
+ vpermq %4, %4, 216
+%endif
+%endmacro
-; 8-bit epel interpolation
-; %1: block width
-; %2: 0 - horizontal; 1 - vertical
-%macro EPEL_8 2
-%if %2
- %define postfix v
- %define mvfrac myq
- %define coeffsaddr r5q
- %define pixstride srcstrideq
- %define pixstride3 r5q
+%macro UNI_COMPUTE 5
+ pmulhrsw %3, %5
+%if %1 > 8 || (%2 > 8 && %1 > 4)
+ pmulhrsw %4, %5
+%endif
+%if %2 == 8
+ packuswb %3, %4
%else
- %define postfix h
- %define mvfrac mxq
- %define coeffsaddr r6q
- %define pixstride 1
- %define pixstride3 3
+ CLIPW %3, [pb_0], [max_pixels_%2]
+%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
+ CLIPW %4, [pb_0], [max_pixels_%2]
+%endif
%endif
+%endmacro
+
- COMMON_DEFS %1, 8
+; ******************************
+; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my)
+; ******************************
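+; Three variants are generated: the plain put stores the source shifted up to
+; 14-bit precision into the int16 destination, the uni variant copies straight
+; to the output picture, and the bi variant adds the int16 src2 block and
+; scales/rounds with pmulhrsw before storing pixels.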
-cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my
- and mvfrac, 0x7
- dec mvfrac
- shl mvfrac, 4
- lea coeffsaddr, [hevc_epel_coeffs8]
- movq m0, [coeffsaddr + mvfrac]
+%macro HEVC_PUT_HEVC_PEL_PIXELS 2
+HEVC_PEL_PIXELS %1, %2
+HEVC_UNI_PEL_PIXELS %1, %2
+HEVC_BI_PEL_PIXELS %1, %2
+%endmacro
- SPLATW m1, m0, 1
- SPLATW m0, m0, 0
+%macro HEVC_PEL_PIXELS 2
+cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
+ pxor m2, m2
+.loop:
+ SIMPLE_LOAD %1, %2, srcq, m0
+ MC_PIXEL_COMPUTE %1, %2, 1
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
+%endmacro
-%if %2
- lea pixstride3, [srcstrideq + 2 * srcstrideq]
-%endif
- sub srcq, pixstride
+%macro HEVC_UNI_PEL_PIXELS 2
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
+.loop:
+ SIMPLE_LOAD %1, %2, srcq, m0
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+%endmacro
+%macro HEVC_BI_PEL_PIXELS 2
+cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
+ pxor m2, m2
+ movdqa m5, [pw_bi_%2]
.loop:
+ SIMPLE_LOAD %1, %2, srcq, m0
+ SIMPLE_BILOAD %1, src2q, m3, m4
+ MC_PIXEL_COMPUTE %1, %2, 1
+ BI_COMPUTE %1, %2, m0, m1, m3, m4, m5, 1
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+%endmacro
-%assign i 0
-%rep nb_blocks
- BLOCK_DEFS i
+; ******************************
+; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my, int width);
+; ******************************
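+; 4-tap horizontal (chroma) filter: mx selects one of the seven fractional
+; filters, EPEL_FILTER splats taps 1-2/3-4 into two registers, and EPEL_LOAD
+; interleaves neighbouring samples so each pmaddubsw/pmaddwd handles one pair.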
- LOAD m2, [srcq + i * blocksize + 0 * pixstride]
- LOAD m3, [srcq + i * blocksize + 1 * pixstride]
- LOAD m4, [srcq + i * blocksize + 2 * pixstride]
- LOAD m5, [srcq + i * blocksize + pixstride3]
- punpcklbw m2, m3
- punpcklbw m4, m5
+%macro HEVC_PUT_HEVC_EPEL 2
+%if cpuflag(avx2)
+%define XMM_REGS 11
+%else
+%define XMM_REGS 8
+%endif
+
+cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+ EPEL_FILTER %2, mx, m4, m5, rfilter
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m4, m5, 1
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
- pmaddubsw m2, m0
- pmaddubsw m4, m1
+cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+ movdqa m6, [pw_%2]
+ EPEL_FILTER %2, mx, m4, m5, rfilter
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m4, m5
+ UNI_COMPUTE %1, %2, m0, m1, m6
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
- paddsw m2, m4
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
+ movdqa m6, [pw_bi_%2]
+ EPEL_FILTER %2, mx, m4, m5, rfilter
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m4, m5, 1
+ SIMPLE_BILOAD %1, src2q, m2, m3
+ BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
- STORE [dstq + i * 2 * blocksize], m2
+; ******************************
+; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my, int width)
+; ******************************
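+; Same 4-tap filter applied vertically: src is rewound by one line and r3src
+; caches 3*srcstride, so the taps cover rows -1..+2 around each sample.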
+
+cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
+ movifnidn myd, mym
+ sub srcq, srcstrideq
+ EPEL_FILTER %2, my, m4, m5, r3src
+ lea r3srcq, [srcstrideq*3]
+.loop:
+ EPEL_LOAD %2, srcq, srcstride, %1
+ EPEL_COMPUTE %2, %1, m4, m5, 1
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
-%assign i (i + 1)
-%endrep
+cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
+ movifnidn myd, mym
+ movdqa m6, [pw_%2]
+ sub srcq, srcstrideq
+ EPEL_FILTER %2, my, m4, m5, r3src
+ lea r3srcq, [srcstrideq*3]
+.loop:
+ EPEL_LOAD %2, srcq, srcstride, %1
+ EPEL_COMPUTE %2, %1, m4, m5
+ UNI_COMPUTE %1, %2, m0, m1, m6
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd
- jg .loop
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
+ movifnidn myd, mym
+ movdqa m6, [pw_bi_%2]
+ sub srcq, srcstrideq
+ EPEL_FILTER %2, my, m4, m5, r3src
+ lea r3srcq, [srcstrideq*3]
+.loop:
+ EPEL_LOAD %2, srcq, srcstride, %1
+ EPEL_COMPUTE %2, %1, m4, m5, 1
+ SIMPLE_BILOAD %1, src2q, m2, m3
+ BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
RET
%endmacro
-INIT_XMM ssse3
-EPEL_8 4, 0
-EPEL_8 8, 0
-EPEL_8 12, 0
-EPEL_8 16, 0
-EPEL_8 24, 0
-EPEL_8 32, 0
-
-EPEL_8 4, 1
-EPEL_8 8, 1
-EPEL_8 12, 1
-EPEL_8 16, 1
-EPEL_8 24, 1
-EPEL_8 32, 1
-%macro EPEL_16 3
-%if %3
- %define mvfrac myq
- %define pixstride srcstrideq
- %define pixstride3 sstride3q
+; ******************************
+; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my, int width)
+; ******************************
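+; Separable 4-tap filter: each source row is first filtered horizontally with
+; m14/m15 into a 14-bit intermediate.  Three rows are primed before the loop
+; (m4..m6, plus m8..m10 for the high halves of wide 8-bit blocks); every
+; iteration then filters one new row and runs the vertical pass at depth 14
+; with m12/m13 before sliding the row window down by one register.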
+
+%macro HEVC_PUT_HEVC_EPEL_HV 2
+cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+ sub srcq, srcstrideq
+ EPEL_HV_FILTER %2
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m8, m1
+%endif
+ SWAP m4, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m9, m1
+%endif
+ SWAP m5, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m10, m1
+%endif
+ SWAP m6, m0
+ add srcq, srcstrideq
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m11, m1
+%endif
+ SWAP m7, m0
+ punpcklwd m0, m4, m5
+ punpcklwd m2, m6, m7
+%if %1 > 4
+ punpckhwd m1, m4, m5
+ punpckhwd m3, m6, m7
+%endif
+ EPEL_COMPUTE 14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
+%if cpuflag(avx2)
+ vinserti128 m2, m0, xm4, 1
+ vperm2i128 m3, m0, m4, q0301
+ PEL_10STORE%1 dstq, m2, m3
%else
- %define mvfrac mxq
- %define pixstride 2
- %define pixstride3 6
+ PEL_10STORE%1 dstq, m0, m4
%endif
+%else
+ PEL_10STORE%1 dstq, m0, m1
+%endif
+ movdqa m4, m5
+ movdqa m5, m6
+ movdqa m6, m7
+%if (%1 > 8 && (%2 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
+ LOOP_END dst, src, srcstride
+ RET
- COMMON_DEFS %1, 16
+cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+ sub srcq, srcstrideq
+ EPEL_HV_FILTER %2
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m8, m1
+%endif
+ SWAP m4, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m9, m1
+%endif
+ SWAP m5, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m10, m1
+%endif
+ SWAP m6, m0
+ add srcq, srcstrideq
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m11, m1
+%endif
+ mova m7, m0
+ punpcklwd m0, m4, m5
+ punpcklwd m2, m6, m7
+%if %1 > 4
+ punpckhwd m1, m4, m5
+ punpckhwd m3, m6, m7
+%endif
+ EPEL_COMPUTE 14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
+ UNI_COMPUTE %1, %2, m0, m4, [pw_%2]
+%else
+ UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
+%endif
+ PEL_%2STORE%1 dstq, m0, m1
+ mova m4, m5
+ mova m5, m6
+ mova m6, m7
+%if (%1 > 8 && (%2 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
- and mvfrac, 0x7
- dec mvfrac
- shl mvfrac, 5
- lea coeffsregq, [hevc_epel_coeffs]
- mova m0, [coeffsregq + mvfrac]
+cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+ sub srcq, srcstrideq
+ EPEL_HV_FILTER %2
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m8, m1
+%endif
+ SWAP m4, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m9, m1
+%endif
+ SWAP m5, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m10, m1
+%endif
+ SWAP m6, m0
+ add srcq, srcstrideq
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m11, m1
+%endif
+ SWAP m7, m0
+ punpcklwd m0, m4, m5
+ punpcklwd m2, m6, m7
+%if %1 > 4
+ punpckhwd m1, m4, m5
+ punpckhwd m3, m6, m7
+%endif
+ EPEL_COMPUTE 14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
+ SIMPLE_BILOAD %1, src2q, m8, m3
+%if cpuflag(avx2)
+ vinserti128 m1, m8, xm3, 1
+ vperm2i128 m2, m8, m3, q0301
+ BI_COMPUTE %1, %2, m0, m4, m1, m2, [pw_bi_%2]
+%else
+ BI_COMPUTE %1, %2, m0, m4, m8, m3, [pw_bi_%2]
+%endif
+%else
+ SIMPLE_BILOAD %1, src2q, m8, m9
+ BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+%endif
+ PEL_%2STORE%1 dstq, m0, m4
+ mova m4, m5
+ mova m5, m6
+ mova m6, m7
+%if (%1 > 8 && (%2 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+%endmacro
- pshufd m1, m0, 0x55
- pshufd m0, m0, 0x00
+; ******************************
+; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my, int width)
+; ******************************
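+; 8-tap horizontal (luma) filter: QPEL_FILTER loads the four coefficient pairs
+; for the selected mx into m12..m15, QPEL_H_LOAD gathers samples -3..+4 around
+; each position, and QPEL_COMPUTE sums the four multiply-accumulates; word
+; input is narrowed with packssdw before the store.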
-%if %3
- lea sstride3q, [srcstrideq + 2 * srcstrideq]
+%macro HEVC_PUT_HEVC_QPEL 2
+cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
+ QPEL_FILTER %2, mx
+.loop:
+ QPEL_H_LOAD %2, srcq, %1, 10
+ QPEL_COMPUTE %1, %2, 1
+%if %2 > 8
+ packssdw m0, m1
%endif
- sub srcq, pixstride
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
+cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
+ mova m9, [pw_%2]
+ QPEL_FILTER %2, mx
.loop:
-
-%assign i 0
-%rep nb_blocks
-
- BLOCK_DEFS i
-
- LOAD m2, [srcq + i * 2 * blocksize + 0 * pixstride]
- LOAD m3, [srcq + i * 2 * blocksize + 1 * pixstride]
- LOAD m4, [srcq + i * 2 * blocksize + 2 * pixstride]
- LOAD m5, [srcq + i * 2 * blocksize + pixstride3]
-
- punpcklwd m6, m2, m3
- punpcklwd m7, m4, m5
- pmaddwd m6, m0
- pmaddwd m7, m1
- paddd m6, m7
- psrad m6, %2
-
- %if block_truncated == 0
- punpckhwd m2, m3
- punpckhwd m4, m5
- pmaddwd m2, m0
- pmaddwd m4, m1
- paddd m2, m4
- psrad m2, %2
- %endif
- packssdw m6, m2
- STORE [dstq + i * 2 * blocksize], m6
-
-%assign i (i + 1)
-%endrep
-
- add dstq, dststrideq
- add srcq, srcstrideq
-
- dec heightd
- jg .loop
+ QPEL_H_LOAD %2, srcq, %1, 10
+ QPEL_COMPUTE %1, %2
+%if %2 > 8
+ packssdw m0, m1
+%endif
+ UNI_COMPUTE %1, %2, m0, m1, m9
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
RET
-%endmacro
-%if ARCH_X86_64
-
-%macro EPEL_H_10 1
-cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 2, 0
-%endmacro
+cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
+ movdqa m9, [pw_bi_%2]
+ QPEL_FILTER %2, mx
+.loop:
+ QPEL_H_LOAD %2, srcq, %1, 10
+ QPEL_COMPUTE %1, %2, 1
+%if %2 > 8
+ packssdw m0, m1
+%endif
+ SIMPLE_BILOAD %1, src2q, m10, m11
+ BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
-INIT_XMM avx
-EPEL_H_10 4
-EPEL_H_10 8
-EPEL_H_10 12
-EPEL_H_10 16
-EPEL_H_10 24
-EPEL_H_10 32
-%macro EPEL_V_10 1
-cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 2, 1
-%endmacro
+; ******************************
+; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my, int width)
+; ******************************
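+; Vertical 8-tap filter: r3src holds 3*srcstride and QPEL_V_LOAD reads rows
+; -3..+4, interleaving adjacent rows so QPEL_COMPUTE can reuse the same
+; coefficient layout as the horizontal case.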
-INIT_XMM avx
-EPEL_V_10 4
-EPEL_V_10 8
-EPEL_V_10 12
-EPEL_V_10 16
-EPEL_V_10 24
-EPEL_V_10 32
+cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
+ movifnidn myd, mym
+ lea r3srcq, [srcstrideq*3]
+ QPEL_FILTER %2, my
+.loop:
+ QPEL_V_LOAD %2, srcq, srcstride, %1, r7
+ QPEL_COMPUTE %1, %2, 1
+%if %2 > 8
+ packssdw m0, m1
+%endif
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
-; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride,
-; int16_t *src, ptrdiff_t srcstride,
-; int height, int mx, int my, int *mcbuffer)
+cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
+ movifnidn myd, mym
+ movdqa m9, [pw_%2]
+ lea r3srcq, [srcstrideq*3]
+ QPEL_FILTER %2, my
+.loop:
+ QPEL_V_LOAD %2, srcq, srcstride, %1, r8
+ QPEL_COMPUTE %1, %2
+%if %2 > 8
+ packssdw m0, m1
+%endif
+ UNI_COMPUTE %1, %2, m0, m1, m9
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
-%macro EPEL_HV 1
-cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 6, 1
+cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
+ movifnidn myd, mym
+ movdqa m9, [pw_bi_%2]
+ lea r3srcq, [srcstrideq*3]
+ QPEL_FILTER %2, my
+.loop:
+ QPEL_V_LOAD %2, srcq, srcstride, %1, r9
+ QPEL_COMPUTE %1, %2, 1
+%if %2 > 8
+ packssdw m0, m1
+%endif
+ SIMPLE_BILOAD %1, src2q, m10, m11
+ BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
%endmacro
-INIT_XMM avx
-EPEL_HV 4
-EPEL_HV 8
-EPEL_HV 12
-EPEL_HV 16
-EPEL_HV 24
-EPEL_HV 32
-%endif ; ARCH_X86_64
+; ******************************
+; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my)
+; ******************************
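+; Separable 8-tap filter: seven rows are filtered horizontally (mx filter) to
+; prime m8..m14, then each iteration filters one more row into m15, interleaves
+; the eight row results and applies the vertical 8-tap pass at depth 14 (my
+; filter), finally shifting the row window down by one register.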
+%macro HEVC_PUT_HEVC_QPEL_HV 2
+cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift 4
+%else
+%assign %%shift 3
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 8 (16 for avx2)
+ shl myq, %%shift ; multiply by 8 (16 for avx2)
+ lea r3srcq, [srcstrideq*3]
+ sub srcq, r3srcq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m8, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m9, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m10, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m11, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m12, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m13, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m14, m0
+ add srcq, srcstrideq
+.loop:
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m15, m0
+ punpcklwd m0, m8, m9
+ punpcklwd m2, m10, m11
+ punpcklwd m4, m12, m13
+ punpcklwd m6, m14, m15
+%if %1 > 4
+ punpckhwd m1, m8, m9
+ punpckhwd m3, m10, m11
+ punpckhwd m5, m12, m13
+ punpckhwd m7, m14, m15
+%endif
+ QPEL_HV_COMPUTE %1, 14, my, ackssdw
+ PEL_10STORE%1 dstq, m0, m1
+%if %1 <= 4
+ movq m8, m9
+ movq m9, m10
+ movq m10, m11
+ movq m11, m12
+ movq m12, m13
+ movq m13, m14
+ movq m14, m15
+%else
+ movdqa m8, m9
+ movdqa m9, m10
+ movdqa m10, m11
+ movdqa m11, m12
+ movdqa m12, m13
+ movdqa m13, m14
+ movdqa m14, m15
+%endif
+ LOOP_END dst, src, srcstride
+ RET
-; hevc_put_unweighted_pred_<w>_<d>(pixel *dst, ptrdiff_t dststride,
-; int16_t *src, ptrdiff_t srcstride,
-; int height)
+cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift 4
+%else
+%assign %%shift 3
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 8 (16 for avx2)
+ shl myq, %%shift ; multiply by 8 (16 for avx2)
+ lea r3srcq, [srcstrideq*3]
+ sub srcq, r3srcq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m8, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m9, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m10, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m11, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m12, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m13, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m14, m0
+ add srcq, srcstrideq
+.loop:
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m15, m0
+ punpcklwd m0, m8, m9
+ punpcklwd m2, m10, m11
+ punpcklwd m4, m12, m13
+ punpcklwd m6, m14, m15
+%if %1 > 4
+ punpckhwd m1, m8, m9
+ punpckhwd m3, m10, m11
+ punpckhwd m5, m12, m13
+ punpckhwd m7, m14, m15
+%endif
+ QPEL_HV_COMPUTE %1, 14, my, ackusdw
+ UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
+ PEL_%2STORE%1 dstq, m0, m1
+
+%if %1 <= 4
+ movq m8, m9
+ movq m9, m10
+ movq m10, m11
+ movq m11, m12
+ movq m12, m13
+ movq m13, m14
+ movq m14, m15
+%else
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+ mova m11, m12
+ mova m12, m13
+ mova m13, m14
+ mova m14, m15
+%endif
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
-%macro AVG 5
- %if %3
- %if %4 == 4
- movq %5, %2
- paddsw %1, %5
- %else
- paddsw %1, %2
- %endif
- %endif
+cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift 4
+%else
+%assign %%shift 3
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 8 (16 for avx2)
+ shl myq, %%shift ; multiply by 8 (16 for avx2)
+ lea r3srcq, [srcstrideq*3]
+ sub srcq, r3srcq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m8, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m9, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m10, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m11, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m12, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m13, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m14, m0
+ add srcq, srcstrideq
+.loop:
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m15, m0
+ punpcklwd m0, m8, m9
+ punpcklwd m2, m10, m11
+ punpcklwd m4, m12, m13
+ punpcklwd m6, m14, m15
+%if %1 > 4
+ punpckhwd m1, m8, m9
+ punpckhwd m3, m10, m11
+ punpckhwd m5, m12, m13
+ punpckhwd m7, m14, m15
+%endif
+ QPEL_HV_COMPUTE %1, 14, my, ackssdw
+ SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case
+ BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+ PEL_%2STORE%1 dstq, m0, m1
+
+%if %1 <= 4
+ movq m8, m9
+ movq m9, m10
+ movq m10, m11
+ movq m11, m12
+ movq m12, m13
+ movq m13, m14
+ movq m14, m15
+%else
+ movdqa m8, m9
+ movdqa m9, m10
+ movdqa m10, m11
+ movdqa m11, m12
+ movdqa m12, m13
+ movdqa m13, m14
+ movdqa m14, m15
+%endif
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
%endmacro
-; %1: 0 - one source; 1 - two sources
-; %2: width
-; %3: bit depth
-%macro PUT_PRED 3
-%if %1
-cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
+%macro WEIGHTING_FUNCS 2
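+; Explicit weighted prediction.  uni_w computes
+;   dst = clip(((src * wx + (1 << (shift - 1))) >> shift) + (ox << (bitd - 8)))
+; with shift = denom + 14 - bitd; bi_w combines two int16 sources with their
+; own weights and a joint rounding offset before the final (shift + 1) shift.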
+%if WIN64 || ARCH_X86_32
+cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
+ mov r4d, denomm
+%define SHIFT r4d
%else
-cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
+cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
+%define SHIFT denomd
%endif
+ lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
+%if %1 <= 4
+ pxor m1, m1
+%endif
+ movd m2, wxm ; WX
+ movd m4, SHIFT ; shift
+%if %1 <= 4
+ punpcklwd m2, m1
+%else
+ punpcklwd m2, m2
+%endif
+ dec SHIFT
+ movdqu m5, [pd_1]
+ movd m6, SHIFT
+ pshufd m2, m2, 0
+ mov SHIFT, oxm
+ pslld m5, m6
+%if %2 != 8
+ shl SHIFT, %2-8 ; ox << (bitd - 8)
+%endif
+ movd m3, SHIFT ; OX
+ pshufd m3, m3, 0
+%if WIN64 || ARCH_X86_32
+ mov SHIFT, heightm
+%endif
+.loop:
+ SIMPLE_LOAD %1, 10, srcq, m0
+%if %1 <= 4
+ punpcklwd m0, m1
+ pmaddwd m0, m2
+ paddd m0, m5
+ psrad m0, m4
+ paddd m0, m3
+%else
+ pmulhw m6, m0, m2
+ pmullw m0, m2
+ punpckhwd m1, m0, m6
+ punpcklwd m0, m6
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, m4
+ psrad m1, m4
+ paddd m0, m3
+ paddd m1, m3
+%endif
+ packssdw m0, m1
+%if %2 == 8
+ packuswb m0, m0
+%else
+ CLIPW m0, [pb_0], [max_pixels_%2]
+%endif
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, 2*MAX_PB_SIZE ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
-%assign shift 14 + %1 - %3
-%assign offset (1 << (shift - 1))
-%define offset_data pw_ %+ offset
-
- mova m0, [offset_data]
-
-%if %3 > 8
- %define STORE_BLOCK movu
- %define STORE_HALF movq
-
- %assign pixel_max ((1 << %3) - 1)
- %define pw_pixel_max pw_ %+ pixel_max
- pxor m1, m1
- mova m2, [pw_pixel_max]
+cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
+ movifnidn r5d, denomm
+%if %1 <= 4
+ pxor m1, m1
+%endif
+ movd m2, wx0m ; WX0
+ lea r5d, [r5d+14-%2] ; shift = 14 - bitd + denom
+ movd m3, wx1m ; WX1
+ movd m0, r5d ; shift
+%if %1 <= 4
+ punpcklwd m2, m1
+ punpcklwd m3, m1
+%else
+ punpcklwd m2, m2
+ punpcklwd m3, m3
+%endif
+ inc r5d
+ movd m5, r5d ; shift+1
+ pshufd m2, m2, 0
+ mov r5d, ox0m
+ pshufd m3, m3, 0
+ add r5d, ox1m
+%if %2 != 8
+ shl r5d, %2-8 ; ox << (bitd - 8)
+%endif
+ inc r5d
+ movd m4, r5d ; offset
+ pshufd m4, m4, 0
+%if UNIX64
+%define h heightd
%else
- %define STORE_BLOCK movq
- %define STORE_HALF movd
+ mov r5d, heightm
+%define h r5d
%endif
+ pslld m4, m0
.loop:
-%assign i 0
-%rep (%2 + 7) / 8
-
- %if (i + 1) * 8 > %2
- %define LOAD movq
- %define STORE STORE_HALF
- %else
- %define LOAD mova
- %define STORE STORE_BLOCK
- %endif
-
- LOAD m3, [srcq + 16 * i]
- AVG m3, [src2q + 16 * i], %1, %3 - i * 8, m4
-
- paddsw m3, m0
- psraw m3, shift
-
- %if %3 == 8
- packuswb m3, m3
- STORE [dstq + 8 * i], m3
- %else
- CLIPW m3, m1, m2
- STORE [dstq + 16 * i], m3
- %endif
-%assign i (i + 1)
-%endrep
-
- add dstq, dststrideq
- add srcq, srcstrideq
-%if %1
- add src2q, srcstrideq
-%endif
-
- dec heightd
- jg .loop
+ SIMPLE_LOAD %1, 10, srcq, m0
+ SIMPLE_LOAD %1, 10, src2q, m8
+%if %1 <= 4
+ punpcklwd m0, m1
+ punpcklwd m8, m1
+ pmaddwd m0, m3
+ pmaddwd m8, m2
+ paddd m0, m4
+ paddd m0, m8
+ psrad m0, m5
+%else
+ pmulhw m6, m0, m3
+ pmullw m0, m3
+ pmulhw m7, m8, m2
+ pmullw m8, m2
+ punpckhwd m1, m0, m6
+ punpcklwd m0, m6
+ punpckhwd m9, m8, m7
+ punpcklwd m8, m7
+ paddd m0, m8
+ paddd m1, m9
+ paddd m0, m4
+ paddd m1, m4
+ psrad m0, m5
+ psrad m1, m5
+%endif
+ packssdw m0, m1
+%if %2 == 8
+ packuswb m0, m0
+%else
+ CLIPW m0, [pb_0], [max_pixels_%2]
+%endif
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, 2*MAX_PB_SIZE ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += srcstride
+ dec h ; cmp height
+ jnz .loop ; height loop
RET
%endmacro
-INIT_XMM sse2
-PUT_PRED 0, 4, 8
-PUT_PRED 1, 4, 8
-PUT_PRED 0, 8, 8
-PUT_PRED 1, 8, 8
-PUT_PRED 0, 12, 8
-PUT_PRED 1, 12, 8
-PUT_PRED 0, 16, 8
-PUT_PRED 1, 16, 8
-PUT_PRED 0, 24, 8
-PUT_PRED 1, 24, 8
-PUT_PRED 0, 32, 8
-PUT_PRED 1, 32, 8
-PUT_PRED 0, 48, 8
-PUT_PRED 1, 48, 8
-PUT_PRED 0, 64, 8
-PUT_PRED 1, 64, 8
-
-PUT_PRED 0, 4, 10
-PUT_PRED 1, 4, 10
-PUT_PRED 0, 8, 10
-PUT_PRED 1, 8, 10
-PUT_PRED 0, 12, 10
-PUT_PRED 1, 12, 10
-PUT_PRED 0, 16, 10
-PUT_PRED 1, 16, 10
-PUT_PRED 0, 24, 10
-PUT_PRED 1, 24, 10
-PUT_PRED 0, 32, 10
-PUT_PRED 1, 32, 10
-PUT_PRED 0, 48, 10
-PUT_PRED 1, 48, 10
-PUT_PRED 0, 64, 10
-PUT_PRED 1, 64, 10
-
-%macro PUT_WEIGHTED_PRED 3
-%if %1
-cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
-%else
-cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
-%endif
-
- and denomd, 0xff
- movsx weight0d, weight0w
- movsx offset0d, offset0w
-%if %1
- movsx weight1d, weight1w
- movsx offset1d, offset1w
-%endif
-
- add denomd, 14 + %1 - %3
- movd m0, denomd
+INIT_XMM sse4 ; adds ff_ and _sse4 to function name
+
+WEIGHTING_FUNCS 2, 8
+WEIGHTING_FUNCS 4, 8
+WEIGHTING_FUNCS 6, 8
+WEIGHTING_FUNCS 8, 8
+
+WEIGHTING_FUNCS 2, 10
+WEIGHTING_FUNCS 4, 10
+WEIGHTING_FUNCS 6, 10
+WEIGHTING_FUNCS 8, 10
-%if %3 > 8
- %assign pixel_max ((1 << %3) - 1)
- %define pw_pixel_max pw_ %+ pixel_max
- pxor m4, m4
- mova m5, [pw_pixel_max]
+WEIGHTING_FUNCS 2, 12
+WEIGHTING_FUNCS 4, 12
+WEIGHTING_FUNCS 6, 12
+WEIGHTING_FUNCS 8, 12
- shl offset0d, %3 - 8
-%if %1
- shl offset1d, %3 - 8
-%endif
-%endif
+HEVC_PUT_HEVC_PEL_PIXELS 2, 8
+HEVC_PUT_HEVC_PEL_PIXELS 4, 8
+HEVC_PUT_HEVC_PEL_PIXELS 6, 8
+HEVC_PUT_HEVC_PEL_PIXELS 8, 8
+HEVC_PUT_HEVC_PEL_PIXELS 12, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 8
-%if %1
- lea offset0d, [offset0d + offset1d + 1]
-%else
- lea offset0d, [2 * offset0d + 1]
-%endif
- movd m1, offset0d
- SPLATD m1
- pslld m1, m0
- psrad m1, 1
+HEVC_PUT_HEVC_PEL_PIXELS 2, 10
+HEVC_PUT_HEVC_PEL_PIXELS 4, 10
+HEVC_PUT_HEVC_PEL_PIXELS 6, 10
+HEVC_PUT_HEVC_PEL_PIXELS 8, 10
- movd m2, weight0d
- SPLATD m2
-%if %1
- movd m3, weight1d
- SPLATD m3
-%endif
+HEVC_PUT_HEVC_PEL_PIXELS 2, 12
+HEVC_PUT_HEVC_PEL_PIXELS 4, 12
+HEVC_PUT_HEVC_PEL_PIXELS 6, 12
+HEVC_PUT_HEVC_PEL_PIXELS 8, 12
-.loop:
-%assign i 0
-%rep (%2 + 3) / 4
+HEVC_PUT_HEVC_EPEL 2, 8
+HEVC_PUT_HEVC_EPEL 4, 8
+HEVC_PUT_HEVC_EPEL 6, 8
+HEVC_PUT_HEVC_EPEL 8, 8
+HEVC_PUT_HEVC_EPEL 12, 8
+HEVC_PUT_HEVC_EPEL 16, 8
- pmovsxwd m6, [src0q + 8 * i]
- pmulld m6, m2
-%if %1
- pmovsxwd m7, [src1q + 8 * i]
- pmulld m7, m3
- paddd m6, m7
-%endif
+HEVC_PUT_HEVC_EPEL 2, 10
+HEVC_PUT_HEVC_EPEL 4, 10
+HEVC_PUT_HEVC_EPEL 6, 10
+HEVC_PUT_HEVC_EPEL 8, 10
- paddd m6, m1
- psrad m6, m0
+HEVC_PUT_HEVC_EPEL 2, 12
+HEVC_PUT_HEVC_EPEL 4, 12
+HEVC_PUT_HEVC_EPEL 6, 12
+HEVC_PUT_HEVC_EPEL 8, 12
- packssdw m6, m6
+HEVC_PUT_HEVC_EPEL_HV 2, 8
+HEVC_PUT_HEVC_EPEL_HV 4, 8
+HEVC_PUT_HEVC_EPEL_HV 6, 8
+HEVC_PUT_HEVC_EPEL_HV 8, 8
+HEVC_PUT_HEVC_EPEL_HV 16, 8
-%if %3 > 8
- CLIPW m6, m4, m5
- movq [dstq + 8 * i], m6
-%else
- packuswb m6, m6
- movd [dstq + 4 * i], m6
-%endif
+HEVC_PUT_HEVC_EPEL_HV 2, 10
+HEVC_PUT_HEVC_EPEL_HV 4, 10
+HEVC_PUT_HEVC_EPEL_HV 6, 10
+HEVC_PUT_HEVC_EPEL_HV 8, 10
-%assign i (i + 1)
-%endrep
+HEVC_PUT_HEVC_EPEL_HV 2, 12
+HEVC_PUT_HEVC_EPEL_HV 4, 12
+HEVC_PUT_HEVC_EPEL_HV 6, 12
+HEVC_PUT_HEVC_EPEL_HV 8, 12
- add dstq, dststrideq
- add src0q, srcstrideq
-%if %1
- add src1q, srcstrideq
-%endif
+HEVC_PUT_HEVC_QPEL 4, 8
+HEVC_PUT_HEVC_QPEL 8, 8
+HEVC_PUT_HEVC_QPEL 12, 8
+HEVC_PUT_HEVC_QPEL 16, 8
- dec heightd
- jg .loop
- RET
-%endmacro
+HEVC_PUT_HEVC_QPEL 4, 10
+HEVC_PUT_HEVC_QPEL 8, 10
-%if ARCH_X86_64
-INIT_XMM sse4
-PUT_WEIGHTED_PRED 0, 4, 8
-PUT_WEIGHTED_PRED 1, 4, 8
-PUT_WEIGHTED_PRED 0, 8, 8
-PUT_WEIGHTED_PRED 1, 8, 8
-PUT_WEIGHTED_PRED 0, 12, 8
-PUT_WEIGHTED_PRED 1, 12, 8
-PUT_WEIGHTED_PRED 0, 16, 8
-PUT_WEIGHTED_PRED 1, 16, 8
-PUT_WEIGHTED_PRED 0, 24, 8
-PUT_WEIGHTED_PRED 1, 24, 8
-PUT_WEIGHTED_PRED 0, 32, 8
-PUT_WEIGHTED_PRED 1, 32, 8
-PUT_WEIGHTED_PRED 0, 48, 8
-PUT_WEIGHTED_PRED 1, 48, 8
-PUT_WEIGHTED_PRED 0, 64, 8
-PUT_WEIGHTED_PRED 1, 64, 8
-
-PUT_WEIGHTED_PRED 0, 4, 10
-PUT_WEIGHTED_PRED 1, 4, 10
-PUT_WEIGHTED_PRED 0, 8, 10
-PUT_WEIGHTED_PRED 1, 8, 10
-PUT_WEIGHTED_PRED 0, 12, 10
-PUT_WEIGHTED_PRED 1, 12, 10
-PUT_WEIGHTED_PRED 0, 16, 10
-PUT_WEIGHTED_PRED 1, 16, 10
-PUT_WEIGHTED_PRED 0, 24, 10
-PUT_WEIGHTED_PRED 1, 24, 10
-PUT_WEIGHTED_PRED 0, 32, 10
-PUT_WEIGHTED_PRED 1, 32, 10
-PUT_WEIGHTED_PRED 0, 48, 10
-PUT_WEIGHTED_PRED 1, 48, 10
-PUT_WEIGHTED_PRED 0, 64, 10
-PUT_WEIGHTED_PRED 1, 64, 10
+HEVC_PUT_HEVC_QPEL 4, 12
+HEVC_PUT_HEVC_QPEL 8, 12
+
+HEVC_PUT_HEVC_QPEL_HV 2, 8
+HEVC_PUT_HEVC_QPEL_HV 4, 8
+HEVC_PUT_HEVC_QPEL_HV 6, 8
+HEVC_PUT_HEVC_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_QPEL_HV 2, 10
+HEVC_PUT_HEVC_QPEL_HV 4, 10
+HEVC_PUT_HEVC_QPEL_HV 6, 10
+HEVC_PUT_HEVC_QPEL_HV 8, 10
+
+HEVC_PUT_HEVC_QPEL_HV 2, 12
+HEVC_PUT_HEVC_QPEL_HV 4, 12
+HEVC_PUT_HEVC_QPEL_HV 6, 12
+HEVC_PUT_HEVC_QPEL_HV 8, 12
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2 ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
+
+HEVC_PUT_HEVC_PEL_PIXELS 32, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 10
+
+HEVC_PUT_HEVC_EPEL 32, 8
+HEVC_PUT_HEVC_EPEL 16, 10
+
+HEVC_PUT_HEVC_EPEL_HV 16, 10
+HEVC_PUT_HEVC_EPEL_HV 32, 8
+
+HEVC_PUT_HEVC_QPEL 32, 8
+
+HEVC_PUT_HEVC_QPEL 16, 10
+
+HEVC_PUT_HEVC_QPEL_HV 16, 10
+%endif ;AVX2
%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm
new file mode 100644
index 0000000000..888a28afa7
--- /dev/null
+++ b/libavcodec/x86/hevc_sao.asm
@@ -0,0 +1,340 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 8bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pb_1
+cextern pb_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
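+; Each sample is classified into one of 32 bands by its five most significant
+; bits (psraw by 3 for 8-bit input); samples falling into the four bands
+; starting at sao_left_class get the matching offset from sao_offset_val
+; added, all other samples pass through unchanged.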
+
+%macro HEVC_SAO_BAND_FILTER_INIT 0
+ and leftq, 31
+ movd xm0, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm1, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm2, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm3, leftd
+
+ SPLATW m0, xm0
+ SPLATW m1, xm1
+ SPLATW m2, xm2
+ SPLATW m3, xm3
+%if mmsize > 16
+ SPLATW m4, [offsetq + 2]
+ SPLATW m5, [offsetq + 4]
+ SPLATW m6, [offsetq + 6]
+ SPLATW m7, [offsetq + 8]
+%else
+ movq m7, [offsetq + 2]
+ SPLATW m4, m7, 0
+ SPLATW m5, m7, 1
+ SPLATW m6, m7, 2
+ SPLATW m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+ pxor m14, m14
+
+%else ; ARCH_X86_32
+ mova [rsp+mmsize*0], m0
+ mova [rsp+mmsize*1], m1
+ mova [rsp+mmsize*2], m2
+ mova [rsp+mmsize*3], m3
+ mova [rsp+mmsize*4], m4
+ mova [rsp+mmsize*5], m5
+ mova [rsp+mmsize*6], m6
+ pxor m0, m0
+ %assign MMSIZE mmsize
+ %define m14 m0
+ %define m13 m1
+ %define m9 m2
+ %define m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+ mov heightd, r7m
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
+ psraw %1, %2, 3
+%if ARCH_X86_64
+ pcmpeqw m10, %1, m0
+ pcmpeqw m11, %1, m1
+ pcmpeqw m12, %1, m2
+ pcmpeqw %1, m3
+ pand m10, m4
+ pand m11, m5
+ pand m12, m6
+ pand %1, m7
+ por m10, m11
+ por m12, %1
+ por m10, m12
+ paddw %2, m10
+%else ; ARCH_X86_32
+ pcmpeqw m4, %1, [rsp+MMSIZE*0]
+ pcmpeqw m5, %1, [rsp+MMSIZE*1]
+ pcmpeqw m6, %1, [rsp+MMSIZE*2]
+ pcmpeqw %1, [rsp+MMSIZE*3]
+ pand m4, [rsp+MMSIZE*4]
+ pand m5, [rsp+MMSIZE*5]
+ pand m6, [rsp+MMSIZE*6]
+ pand %1, m7
+ por m4, m5
+ por m6, %1
+ por m4, m6
+ paddw %2, m4
+%endif ; ARCH
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+; int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro HEVC_SAO_BAND_FILTER 2
+cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+ HEVC_SAO_BAND_FILTER_INIT
+
+align 16
+.loop:
+%if %1 == 8
+ movq m8, [srcq]
+ punpcklbw m8, m14
+ HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
+ packuswb m8, m14
+ movq [dstq], m8
+%endif ; %1 == 8
+
+%assign i 0
+%rep %2
+ mova m13, [srcq + i]
+ punpcklbw m8, m13, m14
+ HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
+ punpckhbw m13, m14
+ HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+ packuswb m8, m13
+ mova [dstq + i], m8
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+ mova m13, [srcq + i]
+ punpcklbw m8, m13, m14
+ HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
+ punpckhbw m13, m14
+ HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+ packuswb m8, m13
+ mova [dstq + i], m8
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif ; %1 == 48
+
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ REP_RET
+%endmacro
+
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER 8, 0
+HEVC_SAO_BAND_FILTER 16, 1
+HEVC_SAO_BAND_FILTER 32, 2
+HEVC_SAO_BAND_FILTER 48, 2
+HEVC_SAO_BAND_FILTER 64, 4
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 8, 0
+HEVC_SAO_BAND_FILTER 16, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 32, 1
+HEVC_SAO_BAND_FILTER 48, 1
+HEVC_SAO_BAND_FILTER 64, 2
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
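+; pb_eo holds the (x, y) offsets of the two neighbours for each of the four
+; edge-offset directions.  Every sample is compared with both neighbours; the
+; two comparison results form an edge index 0..4 that selects one of five
+; offsets via pshufb, and pmaddubsw adds the signed offset to the unsigned
+; sample with saturation before packing back to bytes.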
+
+%define MAX_PB_SIZE 64
+%define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+ movsxd eoq, dword eom
+%elif ARCH_X86_64
+ movsxd eoq, eod
+%else
+ mov eoq, r4m
+%endif
+ lea tmp2q, [pb_eo]
+ movsx a_strideq, byte [tmp2q+eoq*4+1]
+ movsx b_strideq, byte [tmp2q+eoq*4+3]
+ imul a_strideq, EDGE_SRCSTRIDE
+ imul b_strideq, EDGE_SRCSTRIDE
+ movsx tmpq, byte [tmp2q+eoq*4]
+ add a_strideq, tmpq
+ movsx tmpq, byte [tmp2q+eoq*4+2]
+ add b_strideq, tmpq
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
+ pminub m4, m1, m2
+ pminub m5, m1, m3
+ pcmpeqb m2, m4
+ pcmpeqb m3, m5
+ pcmpeqb m4, m1
+ pcmpeqb m5, m1
+ psubb m4, m2
+ psubb m5, m3
+ paddb m4, m6
+ paddb m4, m5
+
+ pshufb m2, m0, m4
+%if %1 > 8
+ punpckhbw m5, m7, m1
+ punpckhbw m4, m2, m7
+ punpcklbw m3, m7, m1
+ punpcklbw m2, m7
+ pmaddubsw m5, m4
+ pmaddubsw m3, m2
+ packuswb m3, m5
+%else
+ punpcklbw m3, m7, m1
+ punpcklbw m2, m7
+ pmaddubsw m3, m2
+ packuswb m3, m3
+%endif
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+; int eo, int width, int height);
+%macro HEVC_SAO_EDGE_FILTER 2-3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+ HEVC_SAO_EDGE_FILTER_INIT
+ mov heightd, r6m
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
+%define eoq srcq
+%define tmpq heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+ HEVC_SAO_EDGE_FILTER_INIT
+ mov srcq, srcm
+ mov offsetq, r3m
+ mov dststrideq, dststridem
+%endif ; ARCH
+
+%if mmsize > 16
+ vbroadcasti128 m0, [offsetq]
+%else
+ movu m0, [offsetq]
+%endif
+ mova m1, [pb_edge_shuffle]
+ packsswb m0, m0
+ mova m7, [pb_1]
+ pshufb m0, m1
+ mova m6, [pb_2]
+%if ARCH_X86_32
+ mov heightd, r6m
+%endif
+
+align 16
+.loop:
+
+%if %1 == 8
+ movq m1, [srcq]
+ movq m2, [srcq + a_strideq]
+ movq m3, [srcq + b_strideq]
+ HEVC_SAO_EDGE_FILTER_COMPUTE %1
+ movq [dstq], m3
+%endif
+
+%assign i 0
+%rep %2
+ mova m1, [srcq + i]
+ movu m2, [srcq + a_strideq + i]
+ movu m3, [srcq + b_strideq + i]
+ HEVC_SAO_EDGE_FILTER_COMPUTE %1
+ mov%3 [dstq + i], m3
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+ mova m1, [srcq + i]
+ movu m2, [srcq + a_strideq + i]
+ movu m3, [srcq + b_strideq + i]
+ HEVC_SAO_EDGE_FILTER_COMPUTE %1
+ mova [dstq + i], m3
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif
+
+ add dstq, dststrideq
+ add srcq, EDGE_SRCSTRIDE
+ dec heightd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM ssse3
+HEVC_SAO_EDGE_FILTER 8, 0
+HEVC_SAO_EDGE_FILTER 16, 1, a
+HEVC_SAO_EDGE_FILTER 32, 2, a
+HEVC_SAO_EDGE_FILTER 48, 2, a
+HEVC_SAO_EDGE_FILTER 64, 4, a
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 32, 1, a
+HEVC_SAO_EDGE_FILTER 48, 1, u
+HEVC_SAO_EDGE_FILTER 64, 2, a
+%endif
diff --git a/libavcodec/x86/hevc_sao_10bit.asm b/libavcodec/x86/hevc_sao_10bit.asm
new file mode 100644
index 0000000000..f81e2d5033
--- /dev/null
+++ b/libavcodec/x86/hevc_sao_10bit.asm
@@ -0,0 +1,370 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 10/12bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m2: times 16 dw -2
+pw_mask10: times 16 dw 0x03FF
+pw_mask12: times 16 dw 0x0FFF
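+; one (a_x, a_y, b_x, b_y) byte quadruple per edge-offset class:
+; horizontal, vertical, 135-degree and 45-degree diagonal neighbours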
+pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pw_m1
+cextern pw_1
+cextern pw_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
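+; Broadcast the four band indices (sao_left_class + 0..3, wrapped mod 32) into
+; m0-m3 and the corresponding offsets into m4-m7; %1 picks the pw_mask10/12
+; clip limit. On x86_32 the constants are spilled to the stack for the loop.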
+%macro HEVC_SAO_BAND_FILTER_INIT 1
+ and leftq, 31
+ movd xm0, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm1, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm2, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm3, leftd
+
+ SPLATW m0, xm0
+ SPLATW m1, xm1
+ SPLATW m2, xm2
+ SPLATW m3, xm3
+%if mmsize > 16
+ SPLATW m4, [offsetq + 2]
+ SPLATW m5, [offsetq + 4]
+ SPLATW m6, [offsetq + 6]
+ SPLATW m7, [offsetq + 8]
+%else
+ movq m7, [offsetq + 2]
+ SPLATW m4, m7, 0
+ SPLATW m5, m7, 1
+ SPLATW m6, m7, 2
+ SPLATW m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+ mova m13, [pw_mask %+ %1]
+ pxor m14, m14
+
+%else ; ARCH_X86_32
+ mova [rsp+mmsize*0], m0
+ mova [rsp+mmsize*1], m1
+ mova [rsp+mmsize*2], m2
+ mova [rsp+mmsize*3], m3
+ mova [rsp+mmsize*4], m4
+ mova [rsp+mmsize*5], m5
+ mova [rsp+mmsize*6], m6
+ mova m1, [pw_mask %+ %1]
+ pxor m0, m0
+ %define m14 m0
+ %define m13 m1
+ %define m9 m2
+ %define m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+ mov heightd, r7m
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+; int16_t *sao_offset_val, int sao_left_class, int width, int height);
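+; %1 = bit depth, %2 = block width, %3 = vector blocks per row. Each sample is
+; shifted right by %1 - 5 to get its band (0..31), compared against the four
+; selected bands, and the matching offset is added before clipping to
+; [0, 2^%1 - 1].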
+%macro HEVC_SAO_BAND_FILTER 3
+cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+ HEVC_SAO_BAND_FILTER_INIT %1
+
+align 16
+.loop:
+
+%assign i 0
+%assign j 0
+%rep %3
+%assign k 8+(j&1)
+%assign l 9-(j&1)
+ mova m %+ k, [srcq + i]
+ psraw m %+ l, m %+ k, %1-5
+%if ARCH_X86_64
+ pcmpeqw m10, m %+ l, m0
+ pcmpeqw m11, m %+ l, m1
+ pcmpeqw m12, m %+ l, m2
+ pcmpeqw m %+ l, m3
+ pand m10, m4
+ pand m11, m5
+ pand m12, m6
+ pand m %+ l, m7
+ por m10, m11
+ por m12, m %+ l
+ por m10, m12
+ paddw m %+ k, m10
+%else ; ARCH_X86_32
+ pcmpeqw m4, m %+ l, [rsp+mmsize*0]
+ pcmpeqw m5, m %+ l, [rsp+mmsize*1]
+ pcmpeqw m6, m %+ l, [rsp+mmsize*2]
+ pcmpeqw m %+ l, [rsp+mmsize*3]
+ pand m4, [rsp+mmsize*4]
+ pand m5, [rsp+mmsize*5]
+ pand m6, [rsp+mmsize*6]
+ pand m %+ l, m7
+ por m4, m5
+ por m6, m %+ l
+ por m4, m6
+ paddw m %+ k, m4
+%endif ; ARCH
+ CLIPW m %+ k, m14, m13
+ mova [dstq + i], m %+ k
+%assign i i+mmsize
+%assign j j+1
+%endrep
+
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd
+ jg .loop
+ REP_RET
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER 10, 8, 1
+HEVC_SAO_BAND_FILTER 10, 16, 2
+HEVC_SAO_BAND_FILTER 10, 32, 4
+HEVC_SAO_BAND_FILTER 10, 48, 6
+HEVC_SAO_BAND_FILTER 10, 64, 8
+
+HEVC_SAO_BAND_FILTER 12, 8, 1
+HEVC_SAO_BAND_FILTER 12, 16, 2
+HEVC_SAO_BAND_FILTER 12, 32, 4
+HEVC_SAO_BAND_FILTER 12, 48, 6
+HEVC_SAO_BAND_FILTER 12, 64, 8
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 10, 8, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 10, 16, 1
+HEVC_SAO_BAND_FILTER 10, 32, 2
+HEVC_SAO_BAND_FILTER 10, 48, 3
+HEVC_SAO_BAND_FILTER 10, 64, 4
+
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 12, 8, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 12, 16, 1
+HEVC_SAO_BAND_FILTER 12, 32, 2
+HEVC_SAO_BAND_FILTER 12, 48, 3
+HEVC_SAO_BAND_FILTER 12, 64, 4
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define MAX_PB_SIZE 64
+%define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
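+; %1 = min(%2, %3) on unsigned words, %4 = scratch; without SSE4's pminuw this
+; is emulated as %2 - max(%2 - %3, 0) using a saturating psubusw.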
+%macro PMINUW 4
+%if cpuflag(sse4)
+ pminuw %1, %2, %3
+%else
+ psubusw %4, %2, %3
+ psubw %1, %2, %4
+%endif
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+ movsxd eoq, dword eom
+%elif ARCH_X86_64
+ movsxd eoq, eod
+%else
+ mov eoq, r4m
+%endif
+ lea tmp2q, [pb_eo]
+ movsx a_strideq, byte [tmp2q+eoq*4+1]
+ movsx b_strideq, byte [tmp2q+eoq*4+3]
+ imul a_strideq, EDGE_SRCSTRIDE >> 1
+ imul b_strideq, EDGE_SRCSTRIDE >> 1
+ movsx tmpq, byte [tmp2q+eoq*4]
+ add a_strideq, tmpq
+ movsx tmpq, byte [tmp2q+eoq*4+2]
+ add b_strideq, tmpq
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+; int eo, int width, int height);
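+; %1 = bit depth, %2 = block width, %3 = vector blocks per row. The neighbour
+; comparisons yield a per-sample sum in -2..2, which is matched against
+; pw_m2/pw_m1/0/pw_1/pw_2 to select one of the five offsets; the result is
+; clipped with pw_mask10/12.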
+%macro HEVC_SAO_EDGE_FILTER 3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+ HEVC_SAO_EDGE_FILTER_INIT
+ mov heightd, r6m
+ add a_strideq, a_strideq
+ add b_strideq, b_strideq
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
+%define eoq srcq
+%define tmpq heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+%define m8 m1
+%define m9 m2
+%define m10 m3
+%define m11 m4
+%define m12 m5
+ HEVC_SAO_EDGE_FILTER_INIT
+ mov srcq, srcm
+ mov offsetq, r3m
+ mov dststrideq, dststridem
+ add a_strideq, a_strideq
+ add b_strideq, b_strideq
+
+%endif ; ARCH
+
+%if mmsize > 16
+ SPLATW m8, [offsetq+2]
+ SPLATW m9, [offsetq+4]
+ SPLATW m10, [offsetq+0]
+ SPLATW m11, [offsetq+6]
+ SPLATW m12, [offsetq+8]
+%else
+ movq m10, [offsetq+0]
+ movd m12, [offsetq+6]
+ SPLATW m8, xm10, 1
+ SPLATW m9, xm10, 2
+ SPLATW m10, xm10, 0
+ SPLATW m11, xm12, 0
+ SPLATW m12, xm12, 1
+%endif
+ pxor m0, m0
+%if ARCH_X86_64
+ mova m13, [pw_m1]
+ mova m14, [pw_1]
+ mova m15, [pw_2]
+%else
+ mov heightd, r6m
+ mova [rsp+mmsize*0], m8
+ mova [rsp+mmsize*1], m9
+ mova [rsp+mmsize*2], m10
+ mova [rsp+mmsize*3], m11
+ mova [rsp+mmsize*4], m12
+%endif
+
+align 16
+.loop:
+
+%assign i 0
+%rep %3
+ mova m1, [srcq + i]
+ movu m2, [srcq+a_strideq + i]
+ movu m3, [srcq+b_strideq + i]
+ PMINUW m4, m1, m2, m6
+ PMINUW m5, m1, m3, m7
+ pcmpeqw m2, m4
+ pcmpeqw m3, m5
+ pcmpeqw m4, m1
+ pcmpeqw m5, m1
+ psubw m4, m2
+ psubw m5, m3
+
+ paddw m4, m5
+ pcmpeqw m2, m4, [pw_m2]
+%if ARCH_X86_64
+ pcmpeqw m3, m4, m13
+ pcmpeqw m5, m4, m0
+ pcmpeqw m6, m4, m14
+ pcmpeqw m7, m4, m15
+ pand m2, m8
+ pand m3, m9
+ pand m5, m10
+ pand m6, m11
+ pand m7, m12
+%else
+ pcmpeqw m3, m4, [pw_m1]
+ pcmpeqw m5, m4, m0
+ pcmpeqw m6, m4, [pw_1]
+ pcmpeqw m7, m4, [pw_2]
+ pand m2, [rsp+mmsize*0]
+ pand m3, [rsp+mmsize*1]
+ pand m5, [rsp+mmsize*2]
+ pand m6, [rsp+mmsize*3]
+ pand m7, [rsp+mmsize*4]
+%endif
+ paddw m2, m3
+ paddw m5, m6
+ paddw m2, m7
+ paddw m2, m1
+ paddw m2, m5
+ CLIPW m2, m0, [pw_mask %+ %1]
+ mova [dstq + i], m2
+%assign i i+mmsize
+%endrep
+
+ add dstq, dststrideq
+ add srcq, EDGE_SRCSTRIDE
+ dec heightd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_EDGE_FILTER 10, 8, 1
+HEVC_SAO_EDGE_FILTER 10, 16, 2
+HEVC_SAO_EDGE_FILTER 10, 32, 4
+HEVC_SAO_EDGE_FILTER 10, 48, 6
+HEVC_SAO_EDGE_FILTER 10, 64, 8
+
+HEVC_SAO_EDGE_FILTER 12, 8, 1
+HEVC_SAO_EDGE_FILTER 12, 16, 2
+HEVC_SAO_EDGE_FILTER 12, 32, 4
+HEVC_SAO_EDGE_FILTER 12, 48, 6
+HEVC_SAO_EDGE_FILTER 12, 64, 8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_EDGE_FILTER 10, 8, 1
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 10, 16, 1
+HEVC_SAO_EDGE_FILTER 10, 32, 2
+HEVC_SAO_EDGE_FILTER 10, 48, 3
+HEVC_SAO_EDGE_FILTER 10, 64, 4
+
+INIT_XMM avx2
+HEVC_SAO_EDGE_FILTER 12, 8, 1
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 12, 16, 1
+HEVC_SAO_EDGE_FILTER 12, 32, 2
+HEVC_SAO_EDGE_FILTER 12, 48, 3
+HEVC_SAO_EDGE_FILTER 12, 64, 4
+%endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
new file mode 100644
index 0000000000..67be0a9059
--- /dev/null
+++ b/libavcodec/x86/hevcdsp.h
@@ -0,0 +1,259 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HEVCDSP_H
+#define AVCODEC_X86_HEVCDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
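+/* PEL_LINK wires one width/bit-depth/ISA entry into the five MC function
+ * pointer tables at once: plain, bi, uni, weighted uni (uni_w) and weighted
+ * bi (bi_w) prediction. */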
+#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
+dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
+dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
+dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
+dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
+
+
+#define PEL_PROTOTYPE(name, D, opt) \
+void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
+void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// MC functions
+///////////////////////////////////////////////////////////////////////////////
+
+#define EPEL_PROTOTYPES(fname, bitd, opt) \
+ PEL_PROTOTYPE(fname##4, bitd, opt); \
+ PEL_PROTOTYPE(fname##6, bitd, opt); \
+ PEL_PROTOTYPE(fname##8, bitd, opt); \
+ PEL_PROTOTYPE(fname##12, bitd, opt); \
+ PEL_PROTOTYPE(fname##16, bitd, opt); \
+ PEL_PROTOTYPE(fname##24, bitd, opt); \
+ PEL_PROTOTYPE(fname##32, bitd, opt); \
+ PEL_PROTOTYPE(fname##48, bitd, opt); \
+ PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define QPEL_PROTOTYPES(fname, bitd, opt) \
+ PEL_PROTOTYPE(fname##4, bitd, opt); \
+ PEL_PROTOTYPE(fname##8, bitd, opt); \
+ PEL_PROTOTYPE(fname##12, bitd, opt); \
+ PEL_PROTOTYPE(fname##16, bitd, opt); \
+ PEL_PROTOTYPE(fname##24, bitd, opt); \
+ PEL_PROTOTYPE(fname##32, bitd, opt); \
+ PEL_PROTOTYPE(fname##48, bitd, opt); \
+ PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
+void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \
+void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1)
+
+#define WEIGHTING_PROTOTYPES(bitd, opt) \
+ WEIGHTING_PROTOTYPE(2, bitd, opt); \
+ WEIGHTING_PROTOTYPE(4, bitd, opt); \
+ WEIGHTING_PROTOTYPE(6, bitd, opt); \
+ WEIGHTING_PROTOTYPE(8, bitd, opt); \
+ WEIGHTING_PROTOTYPE(12, bitd, opt); \
+ WEIGHTING_PROTOTYPE(16, bitd, opt); \
+ WEIGHTING_PROTOTYPE(24, bitd, opt); \
+ WEIGHTING_PROTOTYPE(32, bitd, opt); \
+ WEIGHTING_PROTOTYPE(48, bitd, opt); \
+ WEIGHTING_PROTOTYPE(64, bitd, opt)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL_PIXELS EPEL_PIXELS
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(pel_pixels , 8, sse4);
+EPEL_PROTOTYPES(pel_pixels , 10, sse4);
+EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); // used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); // used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+///////////////////////////////////////////////////////////////////////////////
+// EPEL
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(epel_h , 8, sse4);
+EPEL_PROTOTYPES(epel_h , 10, sse4);
+EPEL_PROTOTYPES(epel_h , 12, sse4);
+
+EPEL_PROTOTYPES(epel_v , 8, sse4);
+EPEL_PROTOTYPES(epel_v , 10, sse4);
+EPEL_PROTOTYPES(epel_v , 12, sse4);
+
+EPEL_PROTOTYPES(epel_hv , 8, sse4);
+EPEL_PROTOTYPES(epel_hv , 10, sse4);
+EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL
+///////////////////////////////////////////////////////////////////////////////
+QPEL_PROTOTYPES(qpel_h , 8, sse4);
+QPEL_PROTOTYPES(qpel_h , 10, sse4);
+QPEL_PROTOTYPES(qpel_h , 12, sse4);
+
+QPEL_PROTOTYPES(qpel_v, 8, sse4);
+QPEL_PROTOTYPES(qpel_v, 10, sse4);
+QPEL_PROTOTYPES(qpel_v, 12, sse4);
+
+QPEL_PROTOTYPES(qpel_hv, 8, sse4);
+QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
+
+WEIGHTING_PROTOTYPES(8, sse4);
+WEIGHTING_PROTOTYPES(10, sse4);
+WEIGHTING_PROTOTYPES(12, sse4);
+
+///////////////////////////////////////////////////////////////////////////////
+// TRANSFORM_ADD
+///////////////////////////////////////////////////////////////////////////////
+
+void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+#endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index a95fa30a95..17cd2332aa 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -2,29 +2,31 @@
* Copyright (c) 2013 Seppo Tomperi
* Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
-
+#include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */
#include "libavcodec/hevcdsp.h"
+#include "libavcodec/x86/hevcdsp.h"
#define LFC_FUNC(DIR, DEPTH, OPT) \
void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q);
@@ -32,43 +34,34 @@ void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix,
#define LFL_FUNC(DIR, DEPTH, OPT) \
void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
-#define LFC_FUNCS(type, depth) \
- LFC_FUNC(h, depth, sse2) \
- LFC_FUNC(v, depth, sse2)
-
-#define LFL_FUNCS(type, depth) \
- LFL_FUNC(h, depth, ssse3) \
- LFL_FUNC(v, depth, ssse3)
-
-LFC_FUNCS(uint8_t, 8)
-LFC_FUNCS(uint8_t, 10)
-LFL_FUNCS(uint8_t, 8)
-LFL_FUNCS(uint8_t, 10)
-
-#define idct_dc_proto(size, bitd, opt) \
- void ff_hevc_idct_ ## size ## _dc_add_ ## bitd ## _ ## opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-
-idct_dc_proto(4, 8,mmxext);
-idct_dc_proto(8, 8,mmxext);
-idct_dc_proto(16,8, sse2);
-idct_dc_proto(32,8, sse2);
-
-idct_dc_proto(32,8, avx2);
+#define LFC_FUNCS(type, depth, opt) \
+ LFC_FUNC(h, depth, opt) \
+ LFC_FUNC(v, depth, opt)
-idct_dc_proto(4, 10,mmxext);
-idct_dc_proto(8, 10, sse2);
-idct_dc_proto(16,10, sse2);
-idct_dc_proto(32,10, sse2);
-idct_dc_proto(8, 10, avx);
-idct_dc_proto(16,10, avx);
-idct_dc_proto(32,10, avx);
+#define LFL_FUNCS(type, depth, opt) \
+ LFL_FUNC(h, depth, opt) \
+ LFL_FUNC(v, depth, opt)
-idct_dc_proto(16,10, avx2);
-idct_dc_proto(32,10, avx2);
+LFC_FUNCS(uint8_t, 8, sse2)
+LFC_FUNCS(uint8_t, 10, sse2)
+LFC_FUNCS(uint8_t, 12, sse2)
+LFC_FUNCS(uint8_t, 8, avx)
+LFC_FUNCS(uint8_t, 10, avx)
+LFC_FUNCS(uint8_t, 12, avx)
+LFL_FUNCS(uint8_t, 8, sse2)
+LFL_FUNCS(uint8_t, 10, sse2)
+LFL_FUNCS(uint8_t, 12, sse2)
+LFL_FUNCS(uint8_t, 8, ssse3)
+LFL_FUNCS(uint8_t, 10, ssse3)
+LFL_FUNCS(uint8_t, 12, ssse3)
+LFL_FUNCS(uint8_t, 8, avx)
+LFL_FUNCS(uint8_t, 10, avx)
+LFL_FUNCS(uint8_t, 12, avx)
#define IDCT_DC_FUNCS(W, opt) \
void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
-void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs)
+void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
+void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
IDCT_DC_FUNCS(4x4, mmxext);
IDCT_DC_FUNCS(8x8, mmxext);
@@ -91,208 +84,631 @@ void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
IDCT_FUNCS(sse2)
IDCT_FUNCS(avx)
-void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-#define GET_PIXELS(width, depth, cf) \
-void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer);
-
-GET_PIXELS(4, 8, sse2)
-GET_PIXELS(8, 8, sse2)
-GET_PIXELS(12, 8, sse2)
-GET_PIXELS(16, 8, sse2)
-GET_PIXELS(24, 8, sse2)
-GET_PIXELS(32, 8, sse2)
-GET_PIXELS(48, 8, sse2)
-GET_PIXELS(64, 8, sse2)
-
-GET_PIXELS(4, 10, sse2)
-GET_PIXELS(8, 10, sse2)
-GET_PIXELS(12, 10, sse2)
-GET_PIXELS(16, 10, sse2)
-GET_PIXELS(24, 10, sse2)
-GET_PIXELS(32, 10, sse2)
-GET_PIXELS(48, 10, sse2)
-GET_PIXELS(64, 10, sse2)
-
-/* those are independent of the bit depth, so declared separately */
-#define INTERP_HV_FUNC(width, cf) \
-void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
- int16_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer); \
-void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
- int16_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer);
-
-INTERP_HV_FUNC(4, avx)
-INTERP_HV_FUNC(8, avx)
-INTERP_HV_FUNC(12, avx)
-INTERP_HV_FUNC(16, avx)
-INTERP_HV_FUNC(24, avx)
-INTERP_HV_FUNC(32, avx)
-INTERP_HV_FUNC(48, avx)
-INTERP_HV_FUNC(64, avx)
-
-#if ARCH_X86_64 && HAVE_AVX_EXTERNAL
-#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
-static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer) \
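+/* The mc_rep_* macros build the wide block sizes from a narrower SIMD kernel:
+ * a W-wide function is synthesized by stepping across the row in step-pixel
+ * columns, with (bitd + 7) / 8 converting the pixel index into a byte offset
+ * for 8-bit vs. 10/12-bit planes. The *_func2 variants instead split an odd
+ * width such as 12 into two differently sized calls (8 + 4). */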
+#define mc_rep_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \
+ uint8_t *_src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ int i; \
+ uint8_t *src; \
+ int16_t *dst; \
+ for (i = 0; i < W; i += step) { \
+ src = _src + (i * ((bitd + 7) / 8)); \
+ dst = _dst + i; \
+ ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
+ } \
+}
+#define mc_rep_uni_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
+ uint8_t *_src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ int i; \
+ uint8_t *src; \
+ uint8_t *dst; \
+ for (i = 0; i < W; i += step) { \
+ src = _src + (i * ((bitd + 7) / 8)); \
+ dst = _dst + (i * ((bitd + 7) / 8)); \
+ ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, \
+ height, mx, my, width); \
+ } \
+}
+#define mc_rep_bi_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src, \
+ ptrdiff_t _srcstride, int16_t* _src2, \
+ int height, intptr_t mx, intptr_t my, int width) \
+{ \
+ int i; \
+ uint8_t *src; \
+ uint8_t *dst; \
+ int16_t *src2; \
+ for (i = 0; i < W ; i += step) { \
+ src = _src + (i * ((bitd + 7) / 8)); \
+ dst = _dst + (i * ((bitd + 7) / 8)); \
+ src2 = _src2 + i; \
+ ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, \
+ height, mx, my, width); \
+ } \
+}
+
+#define mc_rep_funcs(name, bitd, step, W, opt) \
+ mc_rep_func(name, bitd, step, W, opt) \
+ mc_rep_uni_func(name, bitd, step, W, opt) \
+ mc_rep_bi_func(name, bitd, step, W, opt)
+
+#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst, \
+ uint8_t *src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)), \
+ _srcstride, height, mx, my, width); \
+}
+#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
+ ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
+ src + (step1 * ((bitd + 7) / 8)), _srcstride, \
+ height, mx, my, width); \
+}
+#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t _srcstride, int16_t* src2, \
+ int height, intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
+ ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
+ src + (step1 * ((bitd + 7) / 8)), _srcstride, \
+ src2 + step1, height, mx, my, width); \
+}
+
+#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
+ mc_rep_func2(name, bitd, step1, step2, W, opt) \
+ mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
+ mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
+
+#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
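+/* mc_rep_mix_* cover the widths that are not a multiple of the AVX2 block
+ * size (24 and 48): the first part runs through the AVX2 kernel, the
+ * remainder through the SSE4 one; width4 is the byte offset of that
+ * remainder for the 10-bit variants. */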
+#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
+ int height, intptr_t mx, intptr_t my, int width) \
+ \
+{ \
+ ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t _srcstride, int16_t *src2, \
+ int height, intptr_t mx, intptr_t my, int width) \
{ \
- const ptrdiff_t stride = FFALIGN(width + 7, 8); \
- ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
- height + 7, mx, my, mcbuffer); \
- ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride, \
- height, mx, my, mcbuffer); \
+ ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \
+ height, mx, my, width); \
+ ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\
+ height, mx, my, width); \
}
-#else
-#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-#endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
-
-#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
-void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer); \
-void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer); \
-QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-
-QPEL_FUNCS(4, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(8, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(64, 8, ssse3, ssse3, avx)
-
-QPEL_FUNCS(4, 10, avx, avx, avx)
-QPEL_FUNCS(8, 10, avx, avx, avx)
-QPEL_FUNCS(12, 10, avx, avx, avx)
-QPEL_FUNCS(16, 10, avx, avx, avx)
-QPEL_FUNCS(24, 10, avx, avx, avx)
-QPEL_FUNCS(32, 10, avx, avx, avx)
-QPEL_FUNCS(48, 10, avx, avx, avx)
-QPEL_FUNCS(64, 10, avx, avx, avx)
-
-#if ARCH_X86_64 && HAVE_AVX_EXTERNAL
-#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
-static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer) \
+
+#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
{ \
- const ptrdiff_t stride = FFALIGN(width + 3, 8); \
- ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride, \
- height + 3, mx, my, mcbuffer); \
- ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride, \
- height, mx, my, mcbuffer); \
+ ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \
+ height, mx, my, width); \
+ ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, \
+ height, mx, my, width); \
}
-#else
-#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-#endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
-
-#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
-void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer); \
-void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer); \
-EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-
-EPEL_FUNCS(4, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(8, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(32, 8, ssse3, ssse3, avx)
-
-EPEL_FUNCS(4, 10, avx, avx, avx)
-EPEL_FUNCS(8, 10, avx, avx, avx)
-EPEL_FUNCS(12, 10, avx, avx, avx)
-EPEL_FUNCS(16, 10, avx, avx, avx)
-EPEL_FUNCS(24, 10, avx, avx, avx)
-EPEL_FUNCS(32, 10, avx, avx, avx)
-
-#define PUT_PRED(width, depth, cf_uw, cf_w) \
-void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
- int16_t *src, ptrdiff_t srcstride, \
- int height); \
-void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
- int16_t *src1, int16_t *src2, \
- ptrdiff_t srcstride, int height); \
-void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset, \
- uint8_t *dst, ptrdiff_t dststride, \
- int16_t *src, ptrdiff_t srcstride, \
- int height); \
-void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1, \
- int16_t offset0, int16_t offset1, \
- uint8_t *dst, ptrdiff_t dststride, \
- int16_t *src0, int16_t *src1, ptrdiff_t srcstride, \
- int height);
-
-PUT_PRED(4, 8, sse2, sse4)
-PUT_PRED(8, 8, sse2, sse4)
-PUT_PRED(12, 8, sse2, sse4)
-PUT_PRED(16, 8, sse2, sse4)
-PUT_PRED(24, 8, sse2, sse4)
-PUT_PRED(32, 8, sse2, sse4)
-PUT_PRED(48, 8, sse2, sse4)
-PUT_PRED(64, 8, sse2, sse4)
-
-PUT_PRED(4, 10, sse2, sse4)
-PUT_PRED(8, 10, sse2, sse4)
-PUT_PRED(12, 10, sse2, sse4)
-PUT_PRED(16, 10, sse2, sse4)
-PUT_PRED(24, 10, sse2, sse4)
-PUT_PRED(32, 10, sse2, sse4)
-PUT_PRED(48, 10, sse2, sse4)
-PUT_PRED(64, 10, sse2, sse4)
+
+#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4) \
+mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
+
+#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
+ int height, intptr_t mx, intptr_t my, int width) \
+ \
+{ \
+ ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t _srcstride, int16_t* src2, \
+ int height, intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
+ src2, height, mx, my, width); \
+ ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
+ src2+width2, height, mx, my, width); \
+}
+
+#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
+ height, mx, my, width); \
+ ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
+ height, mx, my, width); \
+}
+
+#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2) \
+mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
+
+#if HAVE_AVX2_EXTERNAL
+
+mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_h , 48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_v , 48, 32, 16, avx2, sse4)
+
+mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
+mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_hv, 24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_h , 24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_v , 24, 16, 8, avx2, sse4, 32)
+
+
+mc_rep_mixs_10(qpel_h , 24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(qpel_v , 24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(qpel_hv, 24, 16, 8, avx2, sse4, 32)
+
+
+mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2) // used for 10bit
+mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2)  // used for 10bit
+
+mc_rep_funcs(pel_pixels, 8, 32, 64, avx2)
+
+mc_rep_func(pel_pixels, 10, 16, 32, avx2)
+mc_rep_func(pel_pixels, 10, 16, 48, avx2)
+mc_rep_func(pel_pixels, 10, 32, 64, avx2)
+
+mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
+mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
+mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)
+
+mc_rep_funcs(epel_h, 8, 32, 64, avx2)
+
+mc_rep_funcs(epel_v, 8, 32, 64, avx2)
+
+mc_rep_funcs(epel_h, 10, 16, 32, avx2)
+mc_rep_funcs(epel_h, 10, 16, 48, avx2)
+mc_rep_funcs(epel_h, 10, 32, 64, avx2)
+
+mc_rep_funcs(epel_v, 10, 16, 32, avx2)
+mc_rep_funcs(epel_v, 10, 16, 48, avx2)
+mc_rep_funcs(epel_v, 10, 32, 64, avx2)
+
+
+mc_rep_funcs(epel_hv, 8, 32, 64, avx2)
+
+mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
+mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
+mc_rep_funcs(epel_hv, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
+mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4)
+
+mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
+mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4)
+
+mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_h, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_v, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_hv, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)
+
+#endif //AVX2
+
+mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
+mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
+mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
+mc_rep_funcs(pel_pixels, 8, 8, 24, sse4)
+mc_rep_funcs(pel_pixels,10, 8, 64, sse4)
+mc_rep_funcs(pel_pixels,10, 8, 48, sse4)
+mc_rep_funcs(pel_pixels,10, 8, 32, sse4)
+mc_rep_funcs(pel_pixels,10, 8, 24, sse4)
+mc_rep_funcs(pel_pixels,10, 8, 16, sse4)
+mc_rep_funcs(pel_pixels,10, 4, 12, sse4)
+mc_rep_funcs(pel_pixels,12, 8, 64, sse4)
+mc_rep_funcs(pel_pixels,12, 8, 48, sse4)
+mc_rep_funcs(pel_pixels,12, 8, 32, sse4)
+mc_rep_funcs(pel_pixels,12, 8, 24, sse4)
+mc_rep_funcs(pel_pixels,12, 8, 16, sse4)
+mc_rep_funcs(pel_pixels,12, 4, 12, sse4)
+
+mc_rep_funcs(epel_h, 8, 16, 64, sse4)
+mc_rep_funcs(epel_h, 8, 16, 48, sse4)
+mc_rep_funcs(epel_h, 8, 16, 32, sse4)
+mc_rep_funcs(epel_h, 8, 8, 24, sse4)
+mc_rep_funcs(epel_h,10, 8, 64, sse4)
+mc_rep_funcs(epel_h,10, 8, 48, sse4)
+mc_rep_funcs(epel_h,10, 8, 32, sse4)
+mc_rep_funcs(epel_h,10, 8, 24, sse4)
+mc_rep_funcs(epel_h,10, 8, 16, sse4)
+mc_rep_funcs(epel_h,10, 4, 12, sse4)
+mc_rep_funcs(epel_h,12, 8, 64, sse4)
+mc_rep_funcs(epel_h,12, 8, 48, sse4)
+mc_rep_funcs(epel_h,12, 8, 32, sse4)
+mc_rep_funcs(epel_h,12, 8, 24, sse4)
+mc_rep_funcs(epel_h,12, 8, 16, sse4)
+mc_rep_funcs(epel_h,12, 4, 12, sse4)
+mc_rep_funcs(epel_v, 8, 16, 64, sse4)
+mc_rep_funcs(epel_v, 8, 16, 48, sse4)
+mc_rep_funcs(epel_v, 8, 16, 32, sse4)
+mc_rep_funcs(epel_v, 8, 8, 24, sse4)
+mc_rep_funcs(epel_v,10, 8, 64, sse4)
+mc_rep_funcs(epel_v,10, 8, 48, sse4)
+mc_rep_funcs(epel_v,10, 8, 32, sse4)
+mc_rep_funcs(epel_v,10, 8, 24, sse4)
+mc_rep_funcs(epel_v,10, 8, 16, sse4)
+mc_rep_funcs(epel_v,10, 4, 12, sse4)
+mc_rep_funcs(epel_v,12, 8, 64, sse4)
+mc_rep_funcs(epel_v,12, 8, 48, sse4)
+mc_rep_funcs(epel_v,12, 8, 32, sse4)
+mc_rep_funcs(epel_v,12, 8, 24, sse4)
+mc_rep_funcs(epel_v,12, 8, 16, sse4)
+mc_rep_funcs(epel_v,12, 4, 12, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
+mc_rep_funcs(epel_hv, 8, 8, 24, sse4)
+mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4)
+mc_rep_funcs(epel_hv,10, 8, 64, sse4)
+mc_rep_funcs(epel_hv,10, 8, 48, sse4)
+mc_rep_funcs(epel_hv,10, 8, 32, sse4)
+mc_rep_funcs(epel_hv,10, 8, 24, sse4)
+mc_rep_funcs(epel_hv,10, 8, 16, sse4)
+mc_rep_funcs(epel_hv,10, 4, 12, sse4)
+mc_rep_funcs(epel_hv,12, 8, 64, sse4)
+mc_rep_funcs(epel_hv,12, 8, 48, sse4)
+mc_rep_funcs(epel_hv,12, 8, 32, sse4)
+mc_rep_funcs(epel_hv,12, 8, 24, sse4)
+mc_rep_funcs(epel_hv,12, 8, 16, sse4)
+mc_rep_funcs(epel_hv,12, 4, 12, sse4)
+
+mc_rep_funcs(qpel_h, 8, 16, 64, sse4)
+mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
+mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
+mc_rep_funcs(qpel_h, 8, 8, 24, sse4)
+mc_rep_funcs(qpel_h,10, 8, 64, sse4)
+mc_rep_funcs(qpel_h,10, 8, 48, sse4)
+mc_rep_funcs(qpel_h,10, 8, 32, sse4)
+mc_rep_funcs(qpel_h,10, 8, 24, sse4)
+mc_rep_funcs(qpel_h,10, 8, 16, sse4)
+mc_rep_funcs(qpel_h,10, 4, 12, sse4)
+mc_rep_funcs(qpel_h,12, 8, 64, sse4)
+mc_rep_funcs(qpel_h,12, 8, 48, sse4)
+mc_rep_funcs(qpel_h,12, 8, 32, sse4)
+mc_rep_funcs(qpel_h,12, 8, 24, sse4)
+mc_rep_funcs(qpel_h,12, 8, 16, sse4)
+mc_rep_funcs(qpel_h,12, 4, 12, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
+mc_rep_funcs(qpel_v, 8, 8, 24, sse4)
+mc_rep_funcs(qpel_v,10, 8, 64, sse4)
+mc_rep_funcs(qpel_v,10, 8, 48, sse4)
+mc_rep_funcs(qpel_v,10, 8, 32, sse4)
+mc_rep_funcs(qpel_v,10, 8, 24, sse4)
+mc_rep_funcs(qpel_v,10, 8, 16, sse4)
+mc_rep_funcs(qpel_v,10, 4, 12, sse4)
+mc_rep_funcs(qpel_v,12, 8, 64, sse4)
+mc_rep_funcs(qpel_v,12, 8, 48, sse4)
+mc_rep_funcs(qpel_v,12, 8, 32, sse4)
+mc_rep_funcs(qpel_v,12, 8, 24, sse4)
+mc_rep_funcs(qpel_v,12, 8, 16, sse4)
+mc_rep_funcs(qpel_v,12, 4, 12, sse4)
+mc_rep_funcs(qpel_hv, 8, 8, 64, sse4)
+mc_rep_funcs(qpel_hv, 8, 8, 48, sse4)
+mc_rep_funcs(qpel_hv, 8, 8, 32, sse4)
+mc_rep_funcs(qpel_hv, 8, 8, 24, sse4)
+mc_rep_funcs(qpel_hv, 8, 8, 16, sse4)
+mc_rep_funcs2(qpel_hv,8, 8, 4, 12, sse4)
+mc_rep_funcs(qpel_hv,10, 8, 64, sse4)
+mc_rep_funcs(qpel_hv,10, 8, 48, sse4)
+mc_rep_funcs(qpel_hv,10, 8, 32, sse4)
+mc_rep_funcs(qpel_hv,10, 8, 24, sse4)
+mc_rep_funcs(qpel_hv,10, 8, 16, sse4)
+mc_rep_funcs(qpel_hv,10, 4, 12, sse4)
+mc_rep_funcs(qpel_hv,12, 8, 64, sse4)
+mc_rep_funcs(qpel_hv,12, 8, 48, sse4)
+mc_rep_funcs(qpel_hv,12, 8, 32, sse4)
+mc_rep_funcs(qpel_hv,12, 8, 24, sse4)
+mc_rep_funcs(qpel_hv,12, 8, 16, sse4)
+mc_rep_funcs(qpel_hv,12, 4, 12, sse4)
+
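+/* Same pattern for the weighted prediction kernels: mc_rep_uni_w and
+ * mc_rep_bi_w below repeat the fixed-step asm kernel across the row to cover
+ * the wider block sizes. */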
+#define mc_rep_uni_w(bitd, step, W, opt) \
+void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
+ int height, int denom, int _wx, int _ox) \
+{ \
+ int i; \
+ int16_t *src; \
+ uint8_t *dst; \
+ for (i = 0; i < W; i += step) { \
+ src= _src + i; \
+ dst= _dst + (i * ((bitd + 7) / 8)); \
+ ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, \
+ height, denom, _wx, _ox); \
+ } \
+}
+
+mc_rep_uni_w(8, 6, 12, sse4)
+mc_rep_uni_w(8, 8, 16, sse4)
+mc_rep_uni_w(8, 8, 24, sse4)
+mc_rep_uni_w(8, 8, 32, sse4)
+mc_rep_uni_w(8, 8, 48, sse4)
+mc_rep_uni_w(8, 8, 64, sse4)
+
+mc_rep_uni_w(10, 6, 12, sse4)
+mc_rep_uni_w(10, 8, 16, sse4)
+mc_rep_uni_w(10, 8, 24, sse4)
+mc_rep_uni_w(10, 8, 32, sse4)
+mc_rep_uni_w(10, 8, 48, sse4)
+mc_rep_uni_w(10, 8, 64, sse4)
+
+mc_rep_uni_w(12, 6, 12, sse4)
+mc_rep_uni_w(12, 8, 16, sse4)
+mc_rep_uni_w(12, 8, 24, sse4)
+mc_rep_uni_w(12, 8, 32, sse4)
+mc_rep_uni_w(12, 8, 48, sse4)
+mc_rep_uni_w(12, 8, 64, sse4)
+
+#define mc_rep_bi_w(bitd, step, W, opt) \
+void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
+ int16_t *_src2, int height, \
+ int denom, int _wx0, int _wx1, int _ox0, int _ox1) \
+{ \
+ int i; \
+ int16_t *src; \
+ int16_t *src2; \
+ uint8_t *dst; \
+ for (i = 0; i < W; i += step) { \
+ src = _src + i; \
+ src2 = _src2 + i; \
+ dst = _dst + (i * ((bitd + 7) / 8)); \
+ ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2, \
+ height, denom, _wx0, _wx1, _ox0, _ox1); \
+ } \
+}
+
+mc_rep_bi_w(8, 6, 12, sse4)
+mc_rep_bi_w(8, 8, 16, sse4)
+mc_rep_bi_w(8, 8, 24, sse4)
+mc_rep_bi_w(8, 8, 32, sse4)
+mc_rep_bi_w(8, 8, 48, sse4)
+mc_rep_bi_w(8, 8, 64, sse4)
+
+mc_rep_bi_w(10, 6, 12, sse4)
+mc_rep_bi_w(10, 8, 16, sse4)
+mc_rep_bi_w(10, 8, 24, sse4)
+mc_rep_bi_w(10, 8, 32, sse4)
+mc_rep_bi_w(10, 8, 48, sse4)
+mc_rep_bi_w(10, 8, 64, sse4)
+
+mc_rep_bi_w(12, 6, 12, sse4)
+mc_rep_bi_w(12, 8, 16, sse4)
+mc_rep_bi_w(12, 8, 24, sse4)
+mc_rep_bi_w(12, 8, 32, sse4)
+mc_rep_bi_w(12, 8, 48, sse4)
+mc_rep_bi_w(12, 8, 64, sse4)
+
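+/* mc_uni_w_func chains the two stages of weighted uni prediction: the plain
+ * MC kernel writes the intermediate 16-bit samples into a local temp buffer
+ * and the uni_w kernel then applies denom/weight/offset while storing the
+ * final pixels. */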
+#define mc_uni_w_func(name, bitd, W, opt) \
+void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
+ uint8_t *_src, ptrdiff_t _srcstride, \
+ int height, int denom, \
+ int _wx, int _ox, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
+ ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox);\
+}
+
+#define mc_uni_w_funcs(name, bitd, opt) \
+ mc_uni_w_func(name, bitd, 4, opt) \
+ mc_uni_w_func(name, bitd, 8, opt) \
+ mc_uni_w_func(name, bitd, 12, opt) \
+ mc_uni_w_func(name, bitd, 16, opt) \
+ mc_uni_w_func(name, bitd, 24, opt) \
+ mc_uni_w_func(name, bitd, 32, opt) \
+ mc_uni_w_func(name, bitd, 48, opt) \
+ mc_uni_w_func(name, bitd, 64, opt)
+
+mc_uni_w_funcs(pel_pixels, 8, sse4)
+mc_uni_w_func(pel_pixels, 8, 6, sse4)
+mc_uni_w_funcs(epel_h, 8, sse4)
+mc_uni_w_func(epel_h, 8, 6, sse4)
+mc_uni_w_funcs(epel_v, 8, sse4)
+mc_uni_w_func(epel_v, 8, 6, sse4)
+mc_uni_w_funcs(epel_hv, 8, sse4)
+mc_uni_w_func(epel_hv, 8, 6, sse4)
+mc_uni_w_funcs(qpel_h, 8, sse4)
+mc_uni_w_funcs(qpel_v, 8, sse4)
+mc_uni_w_funcs(qpel_hv, 8, sse4)
+
+mc_uni_w_funcs(pel_pixels, 10, sse4)
+mc_uni_w_func(pel_pixels, 10, 6, sse4)
+mc_uni_w_funcs(epel_h, 10, sse4)
+mc_uni_w_func(epel_h, 10, 6, sse4)
+mc_uni_w_funcs(epel_v, 10, sse4)
+mc_uni_w_func(epel_v, 10, 6, sse4)
+mc_uni_w_funcs(epel_hv, 10, sse4)
+mc_uni_w_func(epel_hv, 10, 6, sse4)
+mc_uni_w_funcs(qpel_h, 10, sse4)
+mc_uni_w_funcs(qpel_v, 10, sse4)
+mc_uni_w_funcs(qpel_hv, 10, sse4)
+
+mc_uni_w_funcs(pel_pixels, 12, sse4)
+mc_uni_w_func(pel_pixels, 12, 6, sse4)
+mc_uni_w_funcs(epel_h, 12, sse4)
+mc_uni_w_func(epel_h, 12, 6, sse4)
+mc_uni_w_funcs(epel_v, 12, sse4)
+mc_uni_w_func(epel_v, 12, 6, sse4)
+mc_uni_w_funcs(epel_hv, 12, sse4)
+mc_uni_w_func(epel_hv, 12, 6, sse4)
+mc_uni_w_funcs(qpel_h, 12, sse4)
+mc_uni_w_funcs(qpel_v, 12, sse4)
+mc_uni_w_funcs(qpel_hv, 12, sse4)
+
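+/* mc_bi_w_func does the same for weighted bi prediction: the first reference
+ * goes through the temp buffer while the second is passed in as the
+ * ready-made 16-bit _src2 plane. */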
+#define mc_bi_w_func(name, bitd, W, opt) \
+void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
+ uint8_t *_src, ptrdiff_t _srcstride, \
+ int16_t *_src2, \
+ int height, int denom, \
+ int _wx0, int _wx1, int _ox0, int _ox1, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
+ ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2, \
+ height, denom, _wx0, _wx1, _ox0, _ox1); \
+}
+
+#define mc_bi_w_funcs(name, bitd, opt) \
+ mc_bi_w_func(name, bitd, 4, opt) \
+ mc_bi_w_func(name, bitd, 8, opt) \
+ mc_bi_w_func(name, bitd, 12, opt) \
+ mc_bi_w_func(name, bitd, 16, opt) \
+ mc_bi_w_func(name, bitd, 24, opt) \
+ mc_bi_w_func(name, bitd, 32, opt) \
+ mc_bi_w_func(name, bitd, 48, opt) \
+ mc_bi_w_func(name, bitd, 64, opt)
+
+mc_bi_w_funcs(pel_pixels, 8, sse4)
+mc_bi_w_func(pel_pixels, 8, 6, sse4)
+mc_bi_w_funcs(epel_h, 8, sse4)
+mc_bi_w_func(epel_h, 8, 6, sse4)
+mc_bi_w_funcs(epel_v, 8, sse4)
+mc_bi_w_func(epel_v, 8, 6, sse4)
+mc_bi_w_funcs(epel_hv, 8, sse4)
+mc_bi_w_func(epel_hv, 8, 6, sse4)
+mc_bi_w_funcs(qpel_h, 8, sse4)
+mc_bi_w_funcs(qpel_v, 8, sse4)
+mc_bi_w_funcs(qpel_hv, 8, sse4)
+
+mc_bi_w_funcs(pel_pixels, 10, sse4)
+mc_bi_w_func(pel_pixels, 10, 6, sse4)
+mc_bi_w_funcs(epel_h, 10, sse4)
+mc_bi_w_func(epel_h, 10, 6, sse4)
+mc_bi_w_funcs(epel_v, 10, sse4)
+mc_bi_w_func(epel_v, 10, 6, sse4)
+mc_bi_w_funcs(epel_hv, 10, sse4)
+mc_bi_w_func(epel_hv, 10, 6, sse4)
+mc_bi_w_funcs(qpel_h, 10, sse4)
+mc_bi_w_funcs(qpel_v, 10, sse4)
+mc_bi_w_funcs(qpel_hv, 10, sse4)
+
+mc_bi_w_funcs(pel_pixels, 12, sse4)
+mc_bi_w_func(pel_pixels, 12, 6, sse4)
+mc_bi_w_funcs(epel_h, 12, sse4)
+mc_bi_w_func(epel_h, 12, 6, sse4)
+mc_bi_w_funcs(epel_v, 12, sse4)
+mc_bi_w_func(epel_v, 12, 6, sse4)
+mc_bi_w_funcs(epel_hv, 12, sse4)
+mc_bi_w_func(epel_hv, 12, 6, sse4)
+mc_bi_w_funcs(qpel_h, 12, sse4)
+mc_bi_w_funcs(qpel_v, 12, sse4)
+mc_bi_w_funcs(qpel_hv, 12, sse4)
+#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
+#define SAO_BAND_FILTER_FUNCS(bitd, opt) \
+void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ int16_t *sao_offset_val, int sao_left_class, int width, int height); \
+void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ int16_t *sao_offset_val, int sao_left_class, int width, int height); \
+void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ int16_t *sao_offset_val, int sao_left_class, int width, int height); \
+void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ int16_t *sao_offset_val, int sao_left_class, int width, int height); \
+void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+SAO_BAND_FILTER_FUNCS(8, sse2)
+SAO_BAND_FILTER_FUNCS(10, sse2)
+SAO_BAND_FILTER_FUNCS(12, sse2)
+SAO_BAND_FILTER_FUNCS(8, avx)
+SAO_BAND_FILTER_FUNCS(10, avx)
+SAO_BAND_FILTER_FUNCS(12, avx)
+SAO_BAND_FILTER_FUNCS(8, avx2)
+SAO_BAND_FILTER_FUNCS(10, avx2)
+SAO_BAND_FILTER_FUNCS(12, avx2)
+
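+/* sao_band_filter[0..4] (and sao_edge_filter below) are indexed by block
+ * width: 8, 16, 32, 48 and 64 samples. */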
+#define SAO_BAND_INIT(bitd, opt) do { \
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_##bitd##_##opt; \
+ c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
+ c->sao_band_filter[2] = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
+ c->sao_band_filter[3] = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
+ c->sao_band_filter[4] = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
+} while (0)
+
+#define SAO_EDGE_FILTER_FUNCS(bitd, opt) \
+void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+ int eo, int width, int height); \
+void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+ int eo, int width, int height); \
+void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+ int eo, int width, int height); \
+void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+ int eo, int width, int height); \
+void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+ int eo, int width, int height); \
+
+SAO_EDGE_FILTER_FUNCS(8, ssse3)
+SAO_EDGE_FILTER_FUNCS(8, avx2)
+SAO_EDGE_FILTER_FUNCS(10, sse2)
+SAO_EDGE_FILTER_FUNCS(10, avx2)
+SAO_EDGE_FILTER_FUNCS(12, sse2)
+SAO_EDGE_FILTER_FUNCS(12, avx2)
+
+#define SAO_EDGE_INIT(bitd, opt) do { \
+ c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_##bitd##_##opt; \
+ c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \
+} while (0)
+
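+/* EPEL_LINKS/QPEL_LINKS fill one (my, mx) slot of the width-indexed MC
+ * tables; indices 1..9 map to widths 4, 6, 8, 12, 16, 24, 32, 48 and 64
+ * (QPEL_LINKS has no width-6 entry). */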
+#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \
+ PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
+ PEL_LINK(pointer, 2, my , mx , fname##6 , bitd, opt ); \
+ PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
+ PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
+ PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \
+ PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
+ PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
+ PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
+ PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
+#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) \
+ PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
+ PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
+ PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
+ PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \
+ PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
+ PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
+ PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
+ PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
{
int cpu_flags = av_get_cpu_flags();
-#define SET_LUMA_FUNCS(tabname, funcname, depth, cf) \
- c->tabname[0] = funcname ## _4_ ## depth ## _ ## cf; \
- c->tabname[1] = funcname ## _8_ ## depth ## _ ## cf; \
- c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
- c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
- c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
- c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
- c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
- c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
-
-#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf) \
- c->tabname[1] = funcname ## _4_ ## depth ## _ ## cf; \
- c->tabname[3] = funcname ## _8_ ## depth ## _ ## cf; \
- c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
- c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
- c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
- c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
-
-#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS (put_hevc_qpel[v][h], name, depth, cf)
-#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
-
if (bit_depth == 8) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
@@ -303,10 +719,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
- c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
- c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
- c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
+ c->idct[2] = ff_hevc_idct_16x16_8_sse2;
+ c->idct[3] = ff_hevc_idct_32x32_8_sse2;
+ }
+ SAO_BAND_INIT(8, sse2);
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
@@ -315,41 +735,166 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct[0] = ff_hevc_idct_4x4_8_sse2;
c->idct[1] = ff_hevc_idct_8x8_8_sse2;
- SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
- SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
-
- SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 8, sse2);
- SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 8, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 8, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
+ c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
+ c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
+ c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
- SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
- SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
- SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
- SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
+ if(ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+ }
+ SAO_EDGE_INIT(8, ssse3);
+ }
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
}
if (EXTERNAL_AVX(cpu_flags)) {
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
+
+ c->idct[2] = ff_hevc_idct_16x16_8_avx;
+ c->idct[3] = ff_hevc_idct_32x32_8_avx;
+ }
+ SAO_BAND_INIT(8, avx);
+
c->idct[0] = ff_hevc_idct_4x4_8_avx;
c->idct[1] = ff_hevc_idct_8x8_8_avx;
+
c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
}
if (EXTERNAL_AVX2(cpu_flags)) {
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
+ c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
+ if (ARCH_X86_64) {
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
+ c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
+ c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
+
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
+ c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
+ c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
+
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
+
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
+ c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
+ c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
+
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
+ c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
+ c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
+
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
+ c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
+ c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
+
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
+ c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
+ c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
+
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
+ c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
+ c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
+
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
+ c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
+ c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
+
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
+ }
+ SAO_BAND_INIT(8, avx2);
+
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
+
c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
-
- c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
+
+ c->idct[2] = ff_hevc_idct_16x16_10_sse2;
+ c->idct[3] = ff_hevc_idct_32x32_10_sse2;
+ }
+ SAO_BAND_INIT(10, sse2);
+ SAO_EDGE_INIT(10, sse2);
c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
@@ -357,89 +902,250 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct[0] = ff_hevc_idct_4x4_10_sse2;
c->idct[1] = ff_hevc_idct_8x8_10_sse2;
- SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
- SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
-
- SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 10, sse2);
- SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 10, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 10, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
}
+ if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+ }
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
+ }
if (EXTERNAL_AVX(cpu_flags)) {
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
+
+ c->idct[2] = ff_hevc_idct_16x16_10_avx;
+ c->idct[3] = ff_hevc_idct_32x32_10_avx;
+ }
+
c->idct[0] = ff_hevc_idct_4x4_10_avx;
c->idct[1] = ff_hevc_idct_8x8_10_avx;
+
+ SAO_BAND_INIT(10, avx);
}
if (EXTERNAL_AVX2(cpu_flags)) {
- c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
- c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
}
- }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
+ if (ARCH_X86_64) {
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
-#if ARCH_X86_64
- if (bit_depth == 8) {
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->idct[2] = ff_hevc_idct_16x16_8_sse2;
- c->idct[3] = ff_hevc_idct_32x32_8_sse2;
- }
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
- }
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
- if (EXTERNAL_SSE4(cpu_flags)) {
- SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 8, sse4);
- SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 8, sse4);
- SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 8, sse4);
- SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
- }
+ c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
- if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
- SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
- SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
- c->idct[2] = ff_hevc_idct_16x16_8_avx;
- c->idct[3] = ff_hevc_idct_32x32_8_avx;
+ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+ c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+ c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+ c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+ c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+
+ c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
+ c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
+ c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
+ c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
+
+ c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
+ c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
+ c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
+ c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
+
+ c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
+ c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
+
+ c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
+ c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
+ c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
+ c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
+
+ c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
+ c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
+ c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
+ c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
+
+ c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
+ c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
+ c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
+ c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
+
+ c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
+ c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
+ c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
+ c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
+
+ c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
+ c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
+ c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
+ c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
+
+ c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
+ c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
+ c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
+ c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
+
+ c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
+ c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
+ c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
+ c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
+ c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
+ c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
+ c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
+ c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
+ c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
+ c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
+ c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
+ c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
+ c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
+ c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
+ c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
+ c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
+ }
+ SAO_BAND_INIT(10, avx2);
+ SAO_EDGE_INIT(10, avx2);
+
+ c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
+ c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
}
- if (EXTERNAL_AVX2(cpu_flags)) {
- c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
- c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
+ } else if (bit_depth == 12) {
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
}
- } else if (bit_depth == 10) {
if (EXTERNAL_SSE2(cpu_flags)) {
- c->idct[2] = ff_hevc_idct_16x16_10_sse2;
- c->idct[3] = ff_hevc_idct_32x32_10_sse2;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
+ }
+ SAO_BAND_INIT(12, sse2);
+ SAO_EDGE_INIT(12, sse2);
+
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
}
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+ if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
}
- if (EXTERNAL_SSE4(cpu_flags)) {
- SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 10, sse4);
- SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 10, sse4);
- SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 10, sse4);
- SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4);
}
if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
- SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
- SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
- SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
- SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
- SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
- SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
- c->idct[2] = ff_hevc_idct_16x16_10_avx;
- c->idct[3] = ff_hevc_idct_32x32_10_avx;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
+ }
+ SAO_BAND_INIT(12, avx);
}
if (EXTERNAL_AVX2(cpu_flags)) {
- c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
- c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
+
+ SAO_BAND_INIT(12, avx2);
+ SAO_EDGE_INIT(12, avx2);
}
}
-#endif /* ARCH_X86_64 */
}
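
The rewritten ff_hevc_dsp_init_x86() above follows the usual FFmpeg dispatch pattern: within each bit-depth branch the CPU-feature checks run from weakest to strongest, so a later, faster implementation simply overwrites the pointer installed by an earlier one, and helpers such as SAO_BAND_INIT/SAO_EDGE_INIT are do { ... } while (0) macros so they act as single statements inside an if. A minimal standalone sketch of that pattern, using purely hypothetical names rather than the FFmpeg API:

#include <stdio.h>

typedef void (*sao_fn)(unsigned char *dst, const unsigned char *src, int width, int height);

static void sao_band_8_c(unsigned char *d, const unsigned char *s, int w, int h)    { (void)d; (void)s; (void)w; (void)h; puts("C");    }
static void sao_band_8_sse2(unsigned char *d, const unsigned char *s, int w, int h) { (void)d; (void)s; (void)w; (void)h; puts("SSE2"); }
static void sao_band_8_avx2(unsigned char *d, const unsigned char *s, int w, int h) { (void)d; (void)s; (void)w; (void)h; puts("AVX2"); }

enum { FLAG_SSE2 = 1, FLAG_AVX2 = 2 };

/* do { ... } while (0) mirrors SAO_BAND_INIT: the initializer can be used
 * as a single statement inside an if without extra braces. */
#define INIT_SAO_TABLE(tab, fn) do { (tab)[0] = (fn); } while (0)

static void init_sao(sao_fn *tab, int cpu_flags)
{
    INIT_SAO_TABLE(tab, sao_band_8_c);        /* always-valid C fallback       */
    if (cpu_flags & FLAG_SSE2)
        INIT_SAO_TABLE(tab, sao_band_8_sse2); /* overwritten by AVX2 below     */
    if (cpu_flags & FLAG_AVX2)
        INIT_SAO_TABLE(tab, sao_band_8_avx2); /* strongest available flag wins */
}

int main(void)
{
    sao_fn tab[1];
    init_sao(tab, FLAG_SSE2 | FLAG_AVX2);
    tab[0](0, 0, 8, 8);                       /* prints "AVX2"                 */
    return 0;
}
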
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 8e211140ca..ce5d7a4e28 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -1,20 +1,27 @@
;******************************************************************************
+;*
+;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
+;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
+;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
+;* Copyright (c) 2013 Daniel Kang
+;*
;* SIMD-optimized halfpel functions
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -22,26 +29,49 @@
SECTION_RODATA
cextern pb_1
+cextern pw_2
+pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7
+
+cextern pw_8192
SECTION .text
; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_x2, 4,5,4
+%else
cglobal put_pixels8_x2, 4,5
+%endif
lea r4, [r2*2]
.loop:
- mova m0, [r1]
- mova m1, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
+ movu m0, [r1+1]
+ movu m1, [r1+r2+1]
+%if cpuflag(sse2)
+ movu m2, [r1]
+ movu m3, [r1+r2]
+ pavgb m0, m2
+ pavgb m1, m3
+%else
+ PAVGB m0, [r1]
+ PAVGB m1, [r1+r2]
+%endif
mova [r0], m0
mova [r0+r2], m1
add r1, r4
add r0, r4
- mova m0, [r1]
- mova m1, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
+ movu m0, [r1+1]
+ movu m1, [r1+r2+1]
+%if cpuflag(sse2)
+ movu m2, [r1]
+ movu m3, [r1+r2]
+ pavgb m0, m2
+ pavgb m1, m3
+%else
+ PAVGB m0, [r1]
+ PAVGB m1, [r1+r2]
+%endif
add r1, r4
mova [r0], m0
mova [r0+r2], m1
@@ -99,6 +129,9 @@ INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
+; The PUT_PIXELS8_X2 macro above also generates the 16-pixel SSE2 version
+INIT_XMM sse2
+PUT_PIXELS8_X2
; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -144,20 +177,24 @@ PUT_NO_RND_PIXELS8_X2
; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_y2, 4,5,3
+%else
cglobal put_pixels8_y2, 4,5
+%endif
lea r4, [r2*2]
- mova m0, [r1]
+ movu m0, [r1]
sub r0, r2
.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
+ movu m1, [r1+r2]
+ movu m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
mova [r0+r2], m0
mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
+ movu m1, [r1+r2]
+ movu m0, [r1+r4]
add r0, r4
add r1, r4
PAVGB m2, m1
@@ -174,6 +211,9 @@ INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
+; actually, put_pixels16_y2_sse2
+INIT_XMM sse2
+PUT_PIXELS8_Y2
; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -245,26 +285,48 @@ AVG_PIXELS8
; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_x2, 4,5,4
+%else
cglobal avg_pixels8_x2, 4,5
+%endif
lea r4, [r2*2]
+%if notcpuflag(mmxext)
+ pcmpeqd m5, m5
+ paddb m5, m5
+%endif
.loop:
- mova m0, [r1]
- mova m2, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m2, [r1+r2+1]
- PAVGB m0, [r0]
- PAVGB m2, [r0+r2]
+ movu m0, [r1]
+ movu m2, [r1+r2]
+%if cpuflag(sse2)
+ movu m1, [r1+1]
+ movu m3, [r1+r2+1]
+ pavgb m0, m1
+ pavgb m2, m3
+%else
+ PAVGB m0, [r1+1], m3, m5
+ PAVGB m2, [r1+r2+1], m4, m5
+%endif
+ PAVGB m0, [r0], m3, m5
+ PAVGB m2, [r0+r2], m4, m5
add r1, r4
mova [r0], m0
mova [r0+r2], m2
- mova m0, [r1]
- mova m2, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m2, [r1+r2+1]
+ movu m0, [r1]
+ movu m2, [r1+r2]
+%if cpuflag(sse2)
+ movu m1, [r1+1]
+ movu m3, [r1+r2+1]
+ pavgb m0, m1
+ pavgb m2, m3
+%else
+ PAVGB m0, [r1+1], m3, m5
+ PAVGB m2, [r1+r2+1], m4, m5
+%endif
add r0, r4
add r1, r4
- PAVGB m0, [r0]
- PAVGB m2, [r0+r2]
+ PAVGB m0, [r0], m3, m5
+ PAVGB m2, [r0+r2], m4, m5
mova [r0], m0
mova [r0+r2], m2
add r0, r4
@@ -273,40 +335,45 @@ cglobal avg_pixels8_x2, 4,5
REP_RET
%endmacro
+INIT_MMX mmx
+AVG_PIXELS8_X2
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
+; actually avg_pixels16_x2
+INIT_XMM sse2
+AVG_PIXELS8_X2
; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_y2, 4,5,3
+%else
cglobal avg_pixels8_y2, 4,5
+%endif
lea r4, [r2*2]
- mova m0, [r1]
+ movu m0, [r1]
sub r0, r2
.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
+ movu m1, [r1+r2]
+ movu m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
- mova m3, [r0+r2]
- mova m4, [r0+r4]
- PAVGB m0, m3
- PAVGB m1, m4
+ PAVGB m0, [r0+r2]
+ PAVGB m1, [r0+r4]
mova [r0+r2], m0
mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
+ movu m1, [r1+r2]
+ movu m0, [r1+r4]
PAVGB m2, m1
PAVGB m1, m0
add r0, r4
add r1, r4
- mova m3, [r0+r2]
- mova m4, [r0+r4]
- PAVGB m2, m3
- PAVGB m1, m4
+ PAVGB m2, [r0+r2]
+ PAVGB m1, [r0+r4]
mova [r0+r2], m2
mova [r0+r4], m1
add r0, r4
@@ -319,11 +386,16 @@ INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
+; actually avg_pixels16_y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro AVG_PIXELS8_XY2 0
-cglobal avg_pixels8_xy2, 4,5
+; Note: this averaging is not correctly rounded, so it is only used
+; when bit-exact output is not required
+%macro AVG_APPROX_PIXELS8_XY2 0
+cglobal avg_approx_pixels8_xy2, 4,5
mova m6, [pb_1]
lea r4, [r2*2]
mova m0, [r1]
@@ -360,6 +432,160 @@ cglobal avg_pixels8_xy2, 4,5
%endmacro
INIT_MMX mmxext
-AVG_PIXELS8_XY2
+AVG_APPROX_PIXELS8_XY2
+INIT_MMX 3dnow
+AVG_APPROX_PIXELS8_XY2
+
+
+; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro SET_PIXELS_XY2 1
+%if cpuflag(sse2)
+cglobal %1_pixels16_xy2, 4,5,8
+%else
+cglobal %1_pixels8_xy2, 4,5
+%endif
+ pxor m7, m7
+ mova m6, [pw_2]
+ movu m0, [r1]
+ movu m4, [r1+1]
+ mova m1, m0
+ mova m5, m4
+ punpcklbw m0, m7
+ punpcklbw m4, m7
+ punpckhbw m1, m7
+ punpckhbw m5, m7
+ paddusw m4, m0
+ paddusw m5, m1
+ xor r4, r4
+ add r1, r2
+.loop:
+ movu m0, [r1+r4]
+ movu m2, [r1+r4+1]
+ mova m1, m0
+ mova m3, m2
+ punpcklbw m0, m7
+ punpcklbw m2, m7
+ punpckhbw m1, m7
+ punpckhbw m3, m7
+ paddusw m0, m2
+ paddusw m1, m3
+ paddusw m4, m6
+ paddusw m5, m6
+ paddusw m4, m0
+ paddusw m5, m1
+ psrlw m4, 2
+ psrlw m5, 2
+%ifidn %1, avg
+ mova m3, [r0+r4]
+ packuswb m4, m5
+ PAVGB m4, m3
+%else
+ packuswb m4, m5
+%endif
+ mova [r0+r4], m4
+ add r4, r2
+
+ movu m2, [r1+r4]
+ movu m4, [r1+r4+1]
+ mova m3, m2
+ mova m5, m4
+ punpcklbw m2, m7
+ punpcklbw m4, m7
+ punpckhbw m3, m7
+ punpckhbw m5, m7
+ paddusw m4, m2
+ paddusw m5, m3
+ paddusw m0, m6
+ paddusw m1, m6
+ paddusw m0, m4
+ paddusw m1, m5
+ psrlw m0, 2
+ psrlw m1, 2
+%ifidn %1, avg
+ mova m3, [r0+r4]
+ packuswb m0, m1
+ PAVGB m0, m3
+%else
+ packuswb m0, m1
+%endif
+ mova [r0+r4], m0
+ add r4, r2
+ sub r3d, 2
+ jnz .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+SET_PIXELS_XY2 avg
INIT_MMX 3dnow
-AVG_PIXELS8_XY2
+SET_PIXELS_XY2 avg
+INIT_XMM sse2
+SET_PIXELS_XY2 put
+SET_PIXELS_XY2 avg
+
+%macro SSSE3_PIXELS_XY2 1-2
+%if %0 == 2 ; second arg given: XMM (16-pixel) variant
+cglobal %1_pixels16_xy2, 4,5,%2
+ mova m4, [pb_interleave16]
+%else
+cglobal %1_pixels8_xy2, 4,5
+ mova m4, [pb_interleave8]
+%endif
+ mova m5, [pb_1]
+ movu m0, [r1]
+ movu m1, [r1+1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ xor r4, r4
+ add r1, r2
+.loop:
+ movu m2, [r1+r4]
+ movu m3, [r1+r4+1]
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ paddusw m0, m2
+ paddusw m1, m3
+ pmulhrsw m0, [pw_8192]
+ pmulhrsw m1, [pw_8192]
+%ifidn %1, avg
+ mova m6, [r0+r4]
+ packuswb m0, m1
+ pshufb m0, m4
+ pavgb m0, m6
+%else
+ packuswb m0, m1
+ pshufb m0, m4
+%endif
+ mova [r0+r4], m0
+ add r4, r2
+
+ movu m0, [r1+r4]
+ movu m1, [r1+r4+1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ paddusw m2, m0
+ paddusw m3, m1
+ pmulhrsw m2, [pw_8192]
+ pmulhrsw m3, [pw_8192]
+%ifidn %1, avg
+ mova m6, [r0+r4]
+ packuswb m2, m3
+ pshufb m2, m4
+ pavgb m2, m6
+%else
+ packuswb m2, m3
+ pshufb m2, m4
+%endif
+ mova [r0+r4], m2
+ add r4, r2
+ sub r3d, 2
+ jnz .loop
+ REP_RET
+%endmacro
+
+INIT_MMX ssse3
+SSSE3_PIXELS_XY2 put
+SSSE3_PIXELS_XY2 avg
+INIT_XMM ssse3
+SSSE3_PIXELS_XY2 put, 6
+SSSE3_PIXELS_XY2 avg, 7
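
The new SET_PIXELS_XY2 and SSSE3_PIXELS_XY2 macros above implement half-pel interpolation in both dimensions: each output byte is the rounded mean of a 2x2 source neighbourhood, computed with a pw_2 bias and a shift by 2 in the SSE2 path, and with pmaddubsw plus pmulhrsw against pw_8192 (which effectively rounds and shifts right by 2 in one step) in the SSSE3 path. A scalar reference of what those kernels are expected to produce, offered as an illustration rather than the FFmpeg implementation:

#include <stddef.h>
#include <stdint.h>

static void put_pixels_xy2_c(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t stride, int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int a = src[x],          b = src[x + 1];
            int c = src[x + stride], d = src[x + stride + 1];
            dst[x] = (uint8_t)((a + b + c + d + 2) >> 2);  /* rounded 2x2 average */
        }
        src += stride;
        dst += stride;
    }
}

static void avg_pixels_xy2_c(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t stride, int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int a = src[x],          b = src[x + 1];
            int c = src[x + stride], d = src[x + stride + 1];
            int p = (a + b + c + d + 2) >> 2;
            dst[x] = (uint8_t)((dst[x] + p + 1) >> 1);     /* blend with existing dst */
        }
        src += stride;
        dst += stride;
    }
}

int main(void)
{
    uint8_t src[9 * 10] = {0}, dst[9 * 10] = {0};
    put_pixels_xy2_c(dst, src, 9, 8, 8);   /* stride must cover width + 1 */
    avg_pixels_xy2_c(dst, src, 9, 8, 8);
    return 0;
}
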
diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h
index 566e518a09..bf97029b57 100644
--- a/libavcodec/x86/hpeldsp.h
+++ b/libavcodec/x86/hpeldsp.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,14 +29,29 @@ void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
-void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags);
+void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags);
#endif /* AVCODEC_X86_HPELDSP_H */
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 6a8d4205fa..58e27e3542 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -3,20 +3,20 @@
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
@@ -39,6 +39,14 @@ void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
@@ -61,10 +69,12 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
#define avg_pixels8_mmx ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
@@ -98,11 +108,13 @@ void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
#undef PAVGB
#undef STATIC
+#if HAVE_MMX
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
+#endif
/***********************************/
/* MMX rounding */
@@ -125,11 +137,13 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#undef PAVGBP
#undef PAVGB
+#if HAVE_MMX
CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
+#endif
#endif /* HAVE_INLINE_ASM */
@@ -143,32 +157,49 @@ CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \
- CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8)
+ CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \
+ CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
HPELDSP_AVG_PIXELS16(_3dnow)
HPELDSP_AVG_PIXELS16(_mmxext)
#endif /* HAVE_YASM */
+#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
+ if (HAVE_MMX_EXTERNAL) \
+ c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU;
+
+#if HAVE_MMX_INLINE
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
- c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
+ SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
} while (0)
+#else
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
+ do { \
+ SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
+ } while (0)
+#endif
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
-#if HAVE_MMX_INLINE
SET_HPEL_FUNCS(put, [0], 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
SET_HPEL_FUNCS(avg, [0], 16, mmx);
SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
SET_HPEL_FUNCS(put, [1], 8, mmx);
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
- SET_HPEL_FUNCS(avg, [1], 8, mmx);
-#endif /* HAVE_MMX_INLINE */
+ if (HAVE_MMX_EXTERNAL) {
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
+ c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
+ }
+#if HAVE_MMX_INLINE
+ c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
+#endif
}
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
@@ -180,6 +211,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
@@ -187,6 +219,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
@@ -194,8 +227,8 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
- c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
+ c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
+ c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
}
#endif /* HAVE_MMXEXT_EXTERNAL */
}
@@ -209,6 +242,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
@@ -216,6 +250,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
if (!(flags & AV_CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
@@ -223,8 +258,8 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
- c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
+ c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
+ c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
}
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}
@@ -234,10 +269,26 @@ static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
#if HAVE_SSE2_EXTERNAL
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
+static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
+{
+#if HAVE_SSSE3_EXTERNAL
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
+ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
+#endif
+}
+
av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
@@ -254,6 +305,9 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
if (EXTERNAL_SSE2_FAST(cpu_flags))
hpeldsp_init_sse2_fast(c, flags);
+ if (EXTERNAL_SSSE3(cpu_flags))
+ hpeldsp_init_ssse3(c, flags);
+
if (CONFIG_VP3_DECODER)
- ff_hpeldsp_vp3_init_x86(c, cpu_flags);
+ ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags);
}
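
Several of the 16-pixel table entries above are built with CALL_2X_PIXELS rather than dedicated kernels. As far as these hunks show, the idea is to synthesize a 16-wide function from two invocations of the 8-wide kernel at byte offsets 0 and 8; the sketch below illustrates that wrapper pattern with illustrative names and a trivial copy kernel, and is an assumption about the macro rather than a copy of it.

#include <stddef.h>
#include <stdint.h>

#define DEFINE_2X_PIXELS(wrapper, kernel8, offset)                        \
static void wrapper(uint8_t *block, const uint8_t *pixels,                \
                    ptrdiff_t line_size, int h)                           \
{                                                                         \
    kernel8(block,            pixels,            line_size, h);          \
    kernel8(block + (offset), pixels + (offset), line_size, h);          \
}

/* Example kernel: plain 8-byte copy per row (stand-in for a put_pixels8). */
static void put_pixels8_c(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    while (h--) {
        for (int i = 0; i < 8; i++)
            block[i] = pixels[i];
        block  += line_size;
        pixels += line_size;
    }
}

DEFINE_2X_PIXELS(put_pixels16_c, put_pixels8_c, 8)

int main(void)
{
    uint8_t src[16 * 4] = {0}, dst[16 * 4];
    put_pixels16_c(dst, src, 16, 4);   /* a table slot would point here */
    return 0;
}
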
diff --git a/libavcodec/x86/hpeldsp_mmx.c b/libavcodec/x86/hpeldsp_mmx.c
deleted file mode 100644
index c93c78e40e..0000000000
--- a/libavcodec/x86/hpeldsp_mmx.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2001 Fabrice Bellard
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "hpeldsp.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %1, %%mm0 \n\t"
- "movq 1%1, %%mm1 \n\t"
- "movq %0, %%mm3 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, %0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- } while (--h);
-}
-
-#endif /* HAVE_MMX_INLINE */
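
The deleted hpeldsp_mmx.c provided an inline-asm ff_avg_pixels8_x2_mmx; an equivalent now comes from the yasm code above. Its scalar meaning, as read from the removed PAVGB_MMX sequence, is to average each pixel with its right neighbour (the x2 half-pel position) and then average that result into the destination. A hedged C illustration, not the removed implementation:

#include <stddef.h>
#include <stdint.h>

static void avg_pixels8_x2_c(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    while (h--) {
        for (int i = 0; i < 8; i++) {
            int hpel = (pixels[i] + pixels[i + 1] + 1) >> 1;  /* x2: right neighbour */
            block[i] = (uint8_t)((block[i] + hpel + 1) >> 1); /* blend into dst      */
        }
        block  += line_size;
        pixels += line_size;
    }
}

int main(void)
{
    uint8_t src[9 * 4] = {0}, dst[9 * 4] = {0};
    avg_pixels8_x2_c(dst, src, 9, 4);
    return 0;
}
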
diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c
index 82231ad13d..2bff2d2766 100644
--- a/libavcodec/x86/hpeldsp_rnd_template.c
+++ b/libavcodec/x86/hpeldsp_rnd_template.c
@@ -7,20 +7,20 @@
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -28,7 +28,7 @@
#include <stdint.h>
// put_pixels
-static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
@@ -60,7 +60,7 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
:FF_REG_a, "memory");
}
-static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
@@ -106,7 +106,7 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff
:FF_REG_a, "memory");
}
-static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
@@ -115,14 +115,14 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"),%%mm2 \n\t"
+ "movq (%1, %%"FF_REG_a"),%%mm2\n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"),%%mm0 \n\t"
+ "movq (%1, %%"FF_REG_a"),%%mm0\n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
@@ -135,33 +135,34 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
:FF_REG_a, "memory");
}
-static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
__asm__ volatile(
- "movq %1, %%mm0 \n\t"
- "movq 1%1, %%mm1 \n\t"
- "movq %0, %%mm3 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%2), %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, %0 \n\t"
- "movq 8%1, %%mm0 \n\t"
- "movq 9%1, %%mm1 \n\t"
- "movq 8%0, %%mm3 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, 8%0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
+ "movq %%mm0, 8(%2) \n\t"
+ "add %3, %1 \n\t"
+ "add %3, %2 \n\t"
+ "subl $1, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
:"memory");
- pixels += line_size;
- block += line_size;
- } while (--h);
}
-static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
@@ -170,7 +171,7 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
@@ -182,7 +183,7 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
"add %%"FF_REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
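
The rounding templates keep averaging packed bytes without SSE's pavgb by way of a SWAR identity, (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xfe) >> 1), applied to all eight bytes of an MMX register at once; the 0xfe-per-byte mask loaded by MOVQ_BFE keeps carries from crossing byte lanes. A small scalar check of that identity, shown as background for the PAVGB/PAVGB_MMX macros rather than as their implementation:

#include <assert.h>
#include <stdint.h>

static uint8_t avg_rnd(uint8_t a, uint8_t b)
{
    /* rounded byte average without widening to 16 bits */
    return (uint8_t)((a | b) - (((a ^ b) & 0xfe) >> 1));
}

int main(void)
{
    for (int a = 0; a < 256; a++)
        for (int b = 0; b < 256; b++)
            assert(avg_rnd((uint8_t)a, (uint8_t)b) == ((a + b + 1) >> 1));
    return 0;
}
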
diff --git a/libavcodec/x86/hpeldsp_vp3.asm b/libavcodec/x86/hpeldsp_vp3.asm
index 513f14e4b4..cba96d06cb 100644
--- a/libavcodec/x86/hpeldsp_vp3.asm
+++ b/libavcodec/x86/hpeldsp_vp3.asm
@@ -1,20 +1,20 @@
;******************************************************************************
;* SIMD-optimized halfpel functions for VP3
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/hpeldsp_vp3_init.c b/libavcodec/x86/hpeldsp_vp3_init.c
index cc1f5e4566..5979f4123c 100644
--- a/libavcodec/x86/hpeldsp_vp3_init.c
+++ b/libavcodec/x86/hpeldsp_vp3_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -38,15 +38,19 @@ void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
-av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags)
+av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags)
{
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
- c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
+ if (flags & AV_CODEC_FLAG_BITEXACT) {
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
+ }
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
+ if (flags & AV_CODEC_FLAG_BITEXACT) {
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
+ }
}
}
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index 692162b5b6..0d8cae354a 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -1,48 +1,155 @@
;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Christophe Gisquet
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-pb_f: times 16 db 15
-pb_zzzzzzzz77777777: times 8 db -1
-pb_7: times 8 db 7
-pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
-pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
-
SECTION .text
-; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
-; const uint8_t *diff, int w,
-; int *left, int *left_top)
+
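+; Shared add/sub loop for 16-bit samples: the bit-depth mask is replicated
+; into every word of m4 and w is converted from words to bytes. Trailing
+; words beyond a multiple of 2*mmsize bytes are handled one at a time by a
+; scalar word loop; the main loop then processes two vectors per iteration,
+; masking each result back to the sample bit depth.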
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+ movd m4, maskd
+ SPLATW m4, m4
+ add wd, wd
+ test wq, 2*mmsize - 1
+ jz %%.tomainloop
+ push tmpq
+%%.wordloop:
+ sub wq, 2
+%ifidn %2, add
+ mov tmpw, [srcq+wq]
+ add tmpw, [dstq+wq]
+%else
+ mov tmpw, [src1q+wq]
+ sub tmpw, [src2q+wq]
+%endif
+ and tmpw, maskw
+ mov [dstq+wq], tmpw
+ test wq, 2*mmsize - 1
+ jnz %%.wordloop
+ pop tmpq
+%%.tomainloop:
+%ifidn %2, add
+ add srcq, wq
+%else
+ add src1q, wq
+ add src2q, wq
+%endif
+ add dstq, wq
+ neg wq
+ jz %%.end
+%%.loop:
+%ifidn %2, add
+ mov%1 m0, [srcq+wq]
+ mov%1 m1, [dstq+wq]
+ mov%1 m2, [srcq+wq+mmsize]
+ mov%1 m3, [dstq+wq+mmsize]
+%else
+ mov%1 m0, [src1q+wq]
+ mov%1 m1, [src2q+wq]
+ mov%1 m2, [src1q+wq+mmsize]
+ mov%1 m3, [src2q+wq+mmsize]
+%endif
+ p%2w m0, m1
+ p%2w m2, m3
+ pand m0, m4
+ pand m2, m4
+ mov%1 [dstq+wq] , m0
+ mov%1 [dstq+wq+mmsize], m2
+ add wq, 2*mmsize
+ jl %%.loop
+%%.end:
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+ INT16_LOOP a, add
+%endif
+
+INIT_XMM sse2
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+ test srcq, mmsize-1
+ jnz .unaligned
+ test dstq, mmsize-1
+ jnz .unaligned
+ INT16_LOOP a, add
+.unaligned:
+ INT16_LOOP u, add
+
+; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
+; intptr_t w, uint8_t *left)
+%macro LEFT_BGR32 0
+cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
+ shl wq, 2
+ movd m0, [leftq]
+ lea dstq, [dstq + wq]
+ lea srcq, [srcq + wq]
+ LSHIFT m0, mmsize-4
+ neg wq
+.loop:
+ movu m1, [srcq+wq]
+ mova m2, m1
+%if mmsize == 8
+ punpckhdq m0, m0
+%endif
+ LSHIFT m1, 4
+ paddb m1, m2
+%if mmsize == 16
+ pshufd m0, m0, q3333
+ mova m2, m1
+ LSHIFT m1, 8
+ paddb m1, m2
+%endif
+ paddb m0, m1
+ movu [dstq+wq], m0
+ add wq, mmsize
+ jl .loop
+ movd m0, [dstq-4]
+ movd [leftq], m0
+ REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+LEFT_BGR32
+%endif
+INIT_XMM sse2
+LEFT_BGR32
+
+; void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top)
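+; 16-bit variant of the median predictor: for each word the prediction is
+; mid(l, t, l + t - tl), the residual from diffq is added, and every
+; intermediate value is masked to the sample bit depth (mm6).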
INIT_MMX mmxext
-cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
+cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
+ add wd, wd
+ movd mm6, maskd
+ SPLATW mm6, mm6
movq mm0, [topq]
movq mm2, mm0
movd mm4, [left_topq]
- psllq mm2, 8
+ psllq mm2, 16
movq mm1, mm0
por mm4, mm2
movd mm3, [leftq]
- psubb mm0, mm4 ; t-tl
+ psubw mm0, mm4 ; t-tl
add dstq, wq
add topq, wq
add diffq, wq
@@ -51,115 +158,45 @@ cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
.loop:
movq mm4, [topq+wq]
movq mm0, mm4
- psllq mm4, 8
+ psllq mm4, 16
por mm4, mm1
movq mm1, mm0 ; t
- psubb mm0, mm4 ; t-tl
+ psubw mm0, mm4 ; t-tl
.skip:
movq mm2, [diffq+wq]
%assign i 0
-%rep 8
+%rep 4
movq mm4, mm0
- paddb mm4, mm3 ; t-tl+l
+ paddw mm4, mm3 ; t-tl+l
+ pand mm4, mm6
movq mm5, mm3
- pmaxub mm3, mm1
- pminub mm5, mm1
- pminub mm3, mm4
- pmaxub mm3, mm5 ; median
- paddb mm3, mm2 ; +residual
+ pmaxsw mm3, mm1
+ pminsw mm5, mm1
+ pminsw mm3, mm4
+ pmaxsw mm3, mm5 ; median
+ paddw mm3, mm2 ; +residual
+ pand mm3, mm6
%if i==0
movq mm7, mm3
- psllq mm7, 56
+ psllq mm7, 48
%else
- movq mm6, mm3
- psrlq mm7, 8
- psllq mm6, 56
- por mm7, mm6
+ movq mm4, mm3
+ psrlq mm7, 16
+ psllq mm4, 48
+ por mm7, mm4
%endif
-%if i<7
- psrlq mm0, 8
- psrlq mm1, 8
- psrlq mm2, 8
+%if i<3
+ psrlq mm0, 16
+ psrlq mm1, 16
+ psrlq mm2, 16
%endif
%assign i i+1
%endrep
movq [dstq+wq], mm7
add wq, 8
jl .loop
- movzx r2d, byte [dstq-1]
+ movzx r2d, word [dstq-2]
mov [leftq], r2d
- movzx r2d, byte [topq-1]
+ movzx r2d, word [topq-2]
mov [left_topq], r2d
RET
-
-
-%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
- add srcq, wq
- add dstq, wq
- neg wq
-%%.loop:
-%if %2
- mova m1, [srcq+wq]
-%else
- movu m1, [srcq+wq]
-%endif
- mova m2, m1
- psllw m1, 8
- paddb m1, m2
- mova m2, m1
- pshufb m1, m3
- paddb m1, m2
- pshufb m0, m5
- mova m2, m1
- pshufb m1, m4
- paddb m1, m2
-%if mmsize == 16
- mova m2, m1
- pshufb m1, m6
- paddb m1, m2
-%endif
- paddb m0, m1
-%if %1
- mova [dstq+wq], m0
-%else
- movq [dstq+wq], m0
- movhps [dstq+wq+8], m0
-%endif
- add wq, mmsize
- jl %%.loop
- mov eax, mmsize-1
- sub eax, wd
- movd m1, eax
- pshufb m0, m1
- movd eax, m0
- RET
-%endmacro
-
-; int ff_add_hfyu_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
-INIT_MMX ssse3
-cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
-.skip_prologue:
- mova m5, [pb_7]
- mova m4, [pb_zzzz3333zzzzbbbb]
- mova m3, [pb_zz11zz55zz99zzdd]
- movd m0, leftm
- psllq m0, 56
- ADD_HFYU_LEFT_LOOP 1, 1
-
-INIT_XMM ssse3
-cglobal add_hfyu_left_pred_unaligned, 3,3,7, dst, src, w, left
- mova m5, [pb_f]
- mova m6, [pb_zzzzzzzz77777777]
- mova m4, [pb_zzzz3333zzzzbbbb]
- mova m3, [pb_zz11zz55zz99zzdd]
- movd m0, leftm
- pslldq m0, 15
- test srcq, 15
- jnz .src_unaligned
- test dstq, 15
- jnz .dst_unaligned
- ADD_HFYU_LEFT_LOOP 1, 1
-.dst_unaligned:
- ADD_HFYU_LEFT_LOOP 0, 1
-.src_unaligned:
- ADD_HFYU_LEFT_LOOP 0, 0
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 80e6cfbb12..26cf6214d8 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -1,132 +1,55 @@
/*
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvdsp.h"
-void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
- const uint8_t *diff, int w,
- int *left, int *left_top);
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
+ intptr_t w, uint8_t *left);
+void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
+ intptr_t w, uint8_t *left);
+void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
-int ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
- int w, int left);
-int ff_add_hfyu_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
- int w, int left);
-
-#if HAVE_INLINE_ASM
-
-#if HAVE_7REGS
-static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
- const uint8_t *diff, int w,
- int *left, int *left_top)
-{
- x86_reg w2 = -w;
- x86_reg x;
- int l = *left & 0xff;
- int tl = *left_top & 0xff;
- int t;
- __asm__ volatile (
- "mov %7, %3 \n"
- "1: \n"
- "movzbl (%3, %4), %2 \n"
- "mov %2, %k3 \n"
- "sub %b1, %b3 \n"
- "add %b0, %b3 \n"
- "mov %2, %1 \n"
- "cmp %0, %2 \n"
- "cmovg %0, %2 \n"
- "cmovg %1, %0 \n"
- "cmp %k3, %0 \n"
- "cmovg %k3, %0 \n"
- "mov %7, %3 \n"
- "cmp %2, %0 \n"
- "cmovl %2, %0 \n"
- "add (%6, %4), %b0 \n"
- "mov %b0, (%5, %4) \n"
- "inc %4 \n"
- "jl 1b \n"
- : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
- : "r"(dst + w), "r"(diff + w), "rm"(top + w)
- );
- *left = l;
- *left_top = tl;
-}
-#endif /* HAVE_7REGS */
-
-static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
- x86_reg i = 0;
-
- __asm__ volatile (
- "jmp 2f \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq (%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, (%2, %0) \n\t"
- "movq 8(%1, %0), %%mm0 \n\t"
- "movq 8(%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "2: \n\t"
- "cmp %3, %0 \n\t"
- "js 1b \n\t"
- : "+r" (i)
- : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
- for (; i < w; i++)
- dst[i + 0] += src[i + 0];
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
+av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt)
{
int cpu_flags = av_get_cpu_flags();
+ const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);
-#if HAVE_INLINE_ASM
-#if HAVE_7REGS
- if (cpu_flags & AV_CPU_FLAG_CMOV)
- c->add_hfyu_median_pred = add_hfyu_median_pred_cmov;
-#endif /* HAVE_7REGS */
-
- if (INLINE_MMX(cpu_flags))
- c->add_bytes = add_bytes_mmx;
-#endif /* HAVE_INLINE_ASM */
-
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- /* slower than cmov version on AMD */
- if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
- c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
+ if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+ c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
+ c->add_int16 = ff_add_int16_mmx;
}
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
+ if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+ c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
}
- if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
- c->add_hfyu_left_pred = ff_add_hfyu_left_pred_unaligned_ssse3;
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->add_int16 = ff_add_int16_sse2;
+ c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
}
}
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
new file mode 100644
index 0000000000..1228aa8355
--- /dev/null
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -0,0 +1,143 @@
+;************************************************************************
+;* SIMD-optimized HuffYUV encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+section .text
+
+; void ff_diff_int16(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+;                    unsigned mask, int w);
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+ movd m4, maskd
+ SPLATW m4, m4
+ add wd, wd
+ test wq, 2*mmsize - 1
+ jz %%.tomainloop
+ push tmpq
+%%.wordloop:
+ sub wq, 2
+%ifidn %2, add
+ mov tmpw, [srcq+wq]
+ add tmpw, [dstq+wq]
+%else
+ mov tmpw, [src1q+wq]
+ sub tmpw, [src2q+wq]
+%endif
+ and tmpw, maskw
+ mov [dstq+wq], tmpw
+ test wq, 2*mmsize - 1
+ jnz %%.wordloop
+ pop tmpq
+%%.tomainloop:
+%ifidn %2, add
+ add srcq, wq
+%else
+ add src1q, wq
+ add src2q, wq
+%endif
+ add dstq, wq
+ neg wq
+ jz %%.end
+%%.loop:
+%ifidn %2, add
+ mov%1 m0, [srcq+wq]
+ mov%1 m1, [dstq+wq]
+ mov%1 m2, [srcq+wq+mmsize]
+ mov%1 m3, [dstq+wq+mmsize]
+%else
+ mov%1 m0, [src1q+wq]
+ mov%1 m1, [src2q+wq]
+ mov%1 m2, [src1q+wq+mmsize]
+ mov%1 m3, [src2q+wq+mmsize]
+%endif
+ p%2w m0, m1
+ p%2w m2, m3
+ pand m0, m4
+ pand m2, m4
+ mov%1 [dstq+wq] , m0
+ mov%1 [dstq+wq+mmsize], m2
+ add wq, 2*mmsize
+ jl %%.loop
+%%.end:
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+ INT16_LOOP a, sub
+%endif
+
+INIT_XMM sse2
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+ test src1q, mmsize-1
+ jnz .unaligned
+ test src2q, mmsize-1
+ jnz .unaligned
+ test dstq, mmsize-1
+ jnz .unaligned
+ INT16_LOOP a, sub
+.unaligned:
+ INT16_LOOP u, sub
+
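+
+; void ff_sub_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *src1,
+;                                    const uint16_t *src2, unsigned mask,
+;                                    int w, int *left, int *left_top)
+; Encoder-side median predictor: dst = src2 - mid(l, t, l + t - tl), where t
+; is the current src1 word, l the previous src2 word and tl the previous src1
+; word, with every word masked to the sample bit depth.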
+INIT_MMX mmxext
+cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
+ add wd, wd
+ movd mm7, maskd
+ SPLATW mm7, mm7
+ movq mm0, [src1q]
+ movq mm2, [src2q]
+ psllq mm0, 16
+ psllq mm2, 16
+ movd mm6, [left_topq]
+ por mm0, mm6
+ movd mm6, [leftq]
+ por mm2, mm6
+ xor maskq, maskq
+.loop:
+ movq mm1, [src1q + maskq]
+ movq mm3, [src2q + maskq]
+ movq mm4, mm2
+ psubw mm2, mm0
+ paddw mm2, mm1
+ pand mm2, mm7
+ movq mm5, mm4
+ pmaxsw mm4, mm1
+ pminsw mm1, mm5
+ pminsw mm4, mm2
+ pmaxsw mm4, mm1
+ psubw mm3, mm4
+ pand mm3, mm7
+ movq [dstq + maskq], mm3
+ add maskq, 8
+ movq mm0, [src1q + maskq - 2]
+ movq mm2, [src2q + maskq - 2]
+ cmp maskq, wq
+ jb .loop
+ movzx maskd, word [src1q + wq - 2]
+ mov [left_topq], maskd
+ movzx maskd, word [src2q + wq - 2]
+ mov [leftq], maskd
+ RET
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
new file mode 100644
index 0000000000..f66bc8c4f0
--- /dev/null
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -0,0 +1,54 @@
+/*
+ * SIMD-optimized HuffYUV encoding functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/huffyuvencdsp.h"
+
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+ unsigned mask, int w);
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+ unsigned mask, int w);
+void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+ unsigned mask, int w, int *left, int *left_top);
+
+av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
+{
+ av_unused int cpu_flags = av_get_cpu_flags();
+ const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
+ if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+ c->diff_int16 = ff_diff_int16_mmx;
+ }
+
+ if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+ c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->diff_int16 = ff_diff_int16_sse2;
+ }
+}
diff --git a/libavcodec/x86/idctdsp.asm b/libavcodec/x86/idctdsp.asm
new file mode 100644
index 0000000000..089425a9ab
--- /dev/null
+++ b/libavcodec/x86/idctdsp.asm
@@ -0,0 +1,183 @@
+;******************************************************************************
+;* SIMD-optimized IDCT-related routines
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_80
+
+SECTION .text
+
+;--------------------------------------------------------------------------
+;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
+; ptrdiff_t line_size)
+;--------------------------------------------------------------------------
+
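+; Each half converts four rows of the 8x8 block: the 16-bit coefficients are
+; packed to bytes with signed saturation and biased by 0x80 (pb_80) into the
+; unsigned pixel range before being stored as 8-byte rows.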
+%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
+ mova m1, [blockq+mmsize*0+%1]
+ mova m2, [blockq+mmsize*2+%1]
+%if mmsize == 8
+ mova m3, [blockq+mmsize*4+%1]
+ mova m4, [blockq+mmsize*6+%1]
+%endif
+ packsswb m1, [blockq+mmsize*1+%1]
+ packsswb m2, [blockq+mmsize*3+%1]
+%if mmsize == 8
+ packsswb m3, [blockq+mmsize*5+%1]
+ packsswb m4, [blockq+mmsize*7+%1]
+%endif
+ paddb m1, m0
+ paddb m2, m0
+%if mmsize == 8
+ paddb m3, m0
+ paddb m4, m0
+ movq [pixelsq+lsizeq*0], m1
+ movq [pixelsq+lsizeq*1], m2
+ movq [pixelsq+lsizeq*2], m3
+ movq [pixelsq+lsize3q ], m4
+%else
+ movq [pixelsq+lsizeq*0], m1
+ movhps [pixelsq+lsizeq*1], m1
+ movq [pixelsq+lsizeq*2], m2
+ movhps [pixelsq+lsize3q ], m2
+%endif
+%endmacro
+
+%macro PUT_SIGNED_PIXELS_CLAMPED 1
+cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
+ mova m0, [pb_80]
+ lea lsize3q, [lsizeq*3]
+ PUT_SIGNED_PIXELS_CLAMPED_HALF 0
+ lea pixelsq, [pixelsq+lsizeq*4]
+ PUT_SIGNED_PIXELS_CLAMPED_HALF 64
+ RET
+%endmacro
+
+INIT_MMX mmx
+PUT_SIGNED_PIXELS_CLAMPED 0
+INIT_XMM sse2
+PUT_SIGNED_PIXELS_CLAMPED 3
+
+;--------------------------------------------------------------------------
+; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
+; ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
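+; Each half packs four rows of 16-bit coefficients to unsigned bytes with
+; saturation (packuswb) and stores them as 8-byte rows.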
+%macro PUT_PIXELS_CLAMPED_HALF 1
+ mova m0, [blockq+mmsize*0+%1]
+ mova m1, [blockq+mmsize*2+%1]
+%if mmsize == 8
+ mova m2, [blockq+mmsize*4+%1]
+ mova m3, [blockq+mmsize*6+%1]
+%endif
+ packuswb m0, [blockq+mmsize*1+%1]
+ packuswb m1, [blockq+mmsize*3+%1]
+%if mmsize == 8
+ packuswb m2, [blockq+mmsize*5+%1]
+ packuswb m3, [blockq+mmsize*7+%1]
+ movq [pixelsq], m0
+ movq [lsizeq+pixelsq], m1
+ movq [2*lsizeq+pixelsq], m2
+ movq [lsize3q+pixelsq], m3
+%else
+ movq [pixelsq], m0
+ movhps [lsizeq+pixelsq], m0
+ movq [2*lsizeq+pixelsq], m1
+ movhps [lsize3q+pixelsq], m1
+%endif
+%endmacro
+
+%macro PUT_PIXELS_CLAMPED 0
+cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
+ lea lsize3q, [lsizeq*3]
+ PUT_PIXELS_CLAMPED_HALF 0
+ lea pixelsq, [pixelsq+lsizeq*4]
+ PUT_PIXELS_CLAMPED_HALF 64
+ RET
+%endmacro
+
+INIT_MMX mmx
+PUT_PIXELS_CLAMPED
+INIT_XMM sse2
+PUT_PIXELS_CLAMPED
+
+;--------------------------------------------------------------------------
+; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
+; ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
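+; Each invocation handles two rows: the 8-bit pixels are unpacked to words
+; against the zero register m4, added to the 16-bit block with signed
+; saturation, and repacked to bytes with unsigned saturation.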
+%macro ADD_PIXELS_CLAMPED 1
+ mova m0, [blockq+mmsize*0+%1]
+ mova m1, [blockq+mmsize*1+%1]
+%if mmsize == 8
+ mova m5, [blockq+mmsize*2+%1]
+ mova m6, [blockq+mmsize*3+%1]
+%endif
+ movq m2, [pixelsq]
+ movq m3, [pixelsq+lsizeq]
+%if mmsize == 8
+ mova m7, m2
+ punpcklbw m2, m4
+ punpckhbw m7, m4
+ paddsw m0, m2
+ paddsw m1, m7
+ mova m7, m3
+ punpcklbw m3, m4
+ punpckhbw m7, m4
+ paddsw m5, m3
+ paddsw m6, m7
+%else
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ paddsw m0, m2
+ paddsw m1, m3
+%endif
+ packuswb m0, m1
+%if mmsize == 8
+ packuswb m5, m6
+ movq [pixelsq], m0
+ movq [pixelsq+lsizeq], m5
+%else
+ movq [pixelsq], m0
+ movhps [pixelsq+lsizeq], m0
+%endif
+%endmacro
+
+%macro ADD_PIXELS_CLAMPED 0
+cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
+ pxor m4, m4
+ ADD_PIXELS_CLAMPED 0
+ lea pixelsq, [pixelsq+lsizeq*2]
+ ADD_PIXELS_CLAMPED 32
+ lea pixelsq, [pixelsq+lsizeq*2]
+ ADD_PIXELS_CLAMPED 64
+ lea pixelsq, [pixelsq+lsizeq*2]
+ ADD_PIXELS_CLAMPED 96
+ RET
+%endmacro
+
+INIT_MMX mmx
+ADD_PIXELS_CLAMPED
+INIT_XMM sse2
+ADD_PIXELS_CLAMPED
diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h
index 6e6c68857d..0d0bdb5f57 100644
--- a/libavcodec/x86/idctdsp.h
+++ b/libavcodec/x86/idctdsp.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -24,9 +24,16 @@
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
+void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+ ptrdiff_t line_size);
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
+void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+ ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+ ptrdiff_t line_size);
+
#endif /* AVCODEC_X86_IDCTDSP_H */
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 853c6a3661..bcf7e5be0e 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -64,12 +64,10 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
- c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
- c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
- c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
-
if (!high_bit_depth &&
+ avctx->lowres == 0 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct_put = ff_simple_idct_put_mmx;
c->idct_add = ff_simple_idct_add_mmx;
@@ -77,4 +75,52 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
c->perm_type = FF_IDCT_PERM_SIMPLE;
}
}
+ if (EXTERNAL_MMX(cpu_flags)) {
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+ c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
+ c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
+ }
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
+ c->put_pixels_clamped = ff_put_pixels_clamped_sse2;
+ c->add_pixels_clamped = ff_add_pixels_clamped_sse2;
+ }
+
+ if (ARCH_X86_64 && avctx->lowres == 0) {
+ if (avctx->bits_per_raw_sample == 10 &&
+ (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLE)) {
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->idct_put = ff_simple_idct10_put_sse2;
+ c->idct_add = NULL;
+ c->idct = ff_simple_idct10_sse2;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ c->idct_put = ff_simple_idct10_put_avx;
+ c->idct_add = NULL;
+ c->idct = ff_simple_idct10_avx;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+ }
+ }
+
+ if (avctx->bits_per_raw_sample == 12 &&
+ (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->idct_put = ff_simple_idct12_put_sse2;
+ c->idct_add = NULL;
+ c->idct = ff_simple_idct12_sse2;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ c->idct_put = ff_simple_idct12_put_avx;
+ c->idct_add = NULL;
+ c->idct = ff_simple_idct12_avx;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+ }
+ }
+ }
}
diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c
deleted file mode 100644
index 523f36816b..0000000000
--- a/libavcodec/x86/idctdsp_mmx.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * SIMD-optimized IDCT-related routines
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "idctdsp.h"
-#include "inline_asm.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- ptrdiff_t line_size)
-{
- const int16_t *p;
- uint8_t *pix;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- /* unrolled loop */
- __asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
- "r" (p)
- : "memory");
- pix += line_size * 4;
- p += 32;
-
- // if here would be an exact copy of the code above
- // compiler would generate some very strange code
- // thus using "r"
- __asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
- "r" (p)
- : "memory");
-}
-
-#define put_signed_pixels_clamped_mmx_half(off) \
- "movq "#off"(%2), %%mm1 \n\t" \
- "movq 16 + "#off"(%2), %%mm2 \n\t" \
- "movq 32 + "#off"(%2), %%mm3 \n\t" \
- "movq 48 + "#off"(%2), %%mm4 \n\t" \
- "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
- "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
- "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
- "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
- "paddb %%mm0, %%mm1 \n\t" \
- "paddb %%mm0, %%mm2 \n\t" \
- "paddb %%mm0, %%mm3 \n\t" \
- "paddb %%mm0, %%mm4 \n\t" \
- "movq %%mm1, (%0) \n\t" \
- "movq %%mm2, (%0, %3) \n\t" \
- "movq %%mm3, (%0, %3, 2) \n\t" \
- "movq %%mm4, (%0, %1) \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- ptrdiff_t line_size)
-{
- x86_reg line_skip = line_size;
- x86_reg line_skip3;
-
- __asm__ volatile (
- "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
- "lea (%3, %3, 2), %1 \n\t"
- put_signed_pixels_clamped_mmx_half(0)
- "lea (%0, %3, 4), %0 \n\t"
- put_signed_pixels_clamped_mmx_half(64)
- : "+&r" (pixels), "=&r" (line_skip3)
- : "r" (block), "r" (line_skip)
- : "memory");
-}
-
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- ptrdiff_t line_size)
-{
- const int16_t *p;
- uint8_t *pix;
- int i;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- MOVQ_ZERO(mm7);
- i = 4;
- do {
- __asm__ volatile (
- "movq (%2), %%mm0 \n\t"
- "movq 8(%2), %%mm1 \n\t"
- "movq 16(%2), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "movq %0, %%mm4 \n\t"
- "movq %1, %%mm6 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddsw %%mm4, %%mm0 \n\t"
- "paddsw %%mm5, %%mm1 \n\t"
- "movq %%mm6, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm6 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddsw %%mm6, %%mm2 \n\t"
- "paddsw %%mm5, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, %0 \n\t"
- "movq %%mm2, %1 \n\t"
- : "+m" (*pix), "+m" (*(pix + line_size))
- : "r" (p)
- : "memory");
- pix += line_size * 2;
- p += 16;
- } while (--i);
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm
index f85e2e4cc3..960eabdda5 100644
--- a/libavcodec/x86/imdct36.asm
+++ b/libavcodec/x86/imdct36.asm
@@ -2,20 +2,20 @@
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -50,7 +50,7 @@ ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
- dd 1.0, 0.70710678118654752439, 0.0, 0.0
+ dd 1.0, -0.70710678118654752439, 0.0, 0.0
costabs: times 4 dd 0.98480773
times 4 dd 0.93969262
@@ -129,7 +129,26 @@ SECTION .text
%endif
%endmacro
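+; Final half-width butterfly, factored out of the main body below: multiply
+; by the last cosh constants, swap lanes pairwise, then combine with addsubps
+; on SSE3 or an xorps/addps pair otherwise.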
+%macro BUTTERF2 3
+%if cpuflag(sse3)
+ mulps %1, %1, [ps_cosh_sse3 + %3]
+ PSHUFD %2, %1, 0xe1
+ addsubps %1, %1, %2
+%else
+ mulps %1, [ps_cosh + %3]
+ PSHUFD %2, %1, 0xe1
+ xorps %1, [ps_p1m1p1m1]
+ addps %1, %2
+%endif
+%endmacro
+
%macro STORE 4
+%if cpuflag(sse4)
+ movss [%3 ], %1
+ extractps dword [%3 + %4], %1, 1
+ extractps dword [%3 + 2*%4], %1, 2
+ extractps dword [%3 + 3*%4], %1, 3
+%else
movhlps %2, %1
movss [%3 ], %1
movss [%3 + 2*%4], %2
@@ -137,6 +156,7 @@ SECTION .text
movss [%3 + %4], %1
movhlps %2, %1
movss [%3 + 3*%4], %2
+%endif
%endmacro
%macro LOAD 4
@@ -279,11 +299,7 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
BUTTERF m7, m2, 16
BUTTERF m3, m6, 32
BUTTERF m4, m1, 48
-
- mulps m5, m5, [ps_cosh + 64]
- PSHUFD m1, m5, 0xe1
- xorps m5, m5, [ps_p1m1p1m1]
- addps m5, m5, m1
+ BUTTERF2 m5, m1, 64
; permutates:
; m0 0 1 2 3 => 2 6 10 14 m1
@@ -358,8 +374,10 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
RET
%endmacro
+%if ARCH_X86_32
INIT_XMM sse
DEFINE_IMDCT
+%endif
INIT_XMM sse2
DEFINE_IMDCT
@@ -370,8 +388,10 @@ DEFINE_IMDCT
INIT_XMM ssse3
DEFINE_IMDCT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
+%endif
INIT_XMM sse
@@ -716,5 +736,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
INIT_XMM sse
DEFINE_FOUR_IMDCT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
+%endif
diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h
index fc554bfc8d..0198746719 100644
--- a/libavcodec/x86/inline_asm.h
+++ b/libavcodec/x86/inline_asm.h
@@ -1,20 +1,20 @@
/*
* inline assembly helper macros
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -37,7 +37,7 @@
"paddb %%"#regd", %%"#regd" \n\t" ::)
#ifndef PIC
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
new file mode 100644
index 0000000000..56b5fbd606
--- /dev/null
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -0,0 +1,144 @@
+;******************************************************************************
+;* SIMD-optimized JPEG2000 DSP functions
+;* Copyright (c) 2014 Nicolas Bertrand
+;* Copyright (c) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pf_ict0: times 8 dd 1.402
+pf_ict1: times 8 dd 0.34413
+pf_ict2: times 8 dd 0.71414
+pf_ict3: times 8 dd 1.772
+
+SECTION .text
+
+;***********************************************************************
+; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
+;***********************************************************************
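+; Inverse irreversible color transform: on input src0/src1/src2 hold Y/Cb/Cr,
+; on output they hold R = Y + 1.402*Cr, G = Y - 0.34413*Cb - 0.71414*Cr and
+; B = Y + 1.772*Cb respectively.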
+%macro ICT_FLOAT 1
+cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
+ shl csized, 2
+ add src0q, csizeq
+ add src1q, csizeq
+ add src2q, csizeq
+ neg csizeq
+ movaps m6, [pf_ict0]
+ movaps m7, [pf_ict1]
+ %define ICT0 m6
+ %define ICT1 m7
+
+%if ARCH_X86_64
+ movaps m8, [pf_ict2]
+ %define ICT2 m8
+%if cpuflag(avx)
+ movaps m3, [pf_ict3]
+ %define ICT3 m3
+%else
+ movaps m9, [pf_ict3]
+ %define ICT3 m9
+%endif
+
+%else ; ARCH_X86_32
+ %define ICT2 [pf_ict2]
+%if cpuflag(avx)
+ movaps m3, [pf_ict3]
+ %define ICT3 m3
+%else
+ %define ICT3 [pf_ict3]
+%endif
+
+%endif ; ARCH
+
+align 16
+.loop:
+ movaps m0, [src0q+csizeq]
+ movaps m1, [src1q+csizeq]
+ movaps m2, [src2q+csizeq]
+
+%if cpuflag(avx)
+ mulps m5, m1, ICT1
+ mulps m4, m2, ICT0
+ mulps m1, m1, ICT3
+ mulps m2, m2, ICT2
+ subps m5, m0, m5
+%else ; sse
+ movaps m3, m1
+ movaps m4, m2
+ movaps m5, m0
+ mulps m3, ICT1
+ mulps m4, ICT0
+ mulps m1, ICT3
+ mulps m2, ICT2
+ subps m5, m3
+%endif
+ addps m4, m4, m0
+ addps m0, m0, m1
+ subps m5, m5, m2
+
+ movaps [src0q+csizeq], m4
+ movaps [src2q+csizeq], m0
+ movaps [src1q+csizeq], m5
+ add csizeq, mmsize
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+ICT_FLOAT 10
+INIT_YMM avx
+ICT_FLOAT 9
+
+;***************************************************************************
+; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
+;***************************************************************************
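+; Inverse reversible color transform: G = src0 - ((src1 + src2) >> 2), then
+; R = src2 + G and B = src1 + G; the results are stored back to src0 (R),
+; src1 (G) and src2 (B).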
+%macro RCT_INT 0
+cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
+ shl csized, 2
+ add src0q, csizeq
+ add src1q, csizeq
+ add src2q, csizeq
+ neg csizeq
+
+align 16
+.loop:
+ mova m1, [src1q+csizeq]
+ mova m2, [src2q+csizeq]
+ mova m0, [src0q+csizeq]
+ paddd m3, m1, m2
+ psrad m3, 2
+ psubd m0, m3
+ paddd m1, m0
+ paddd m2, m0
+ mova [src1q+csizeq], m0
+ mova [src2q+csizeq], m1
+ mova [src0q+csizeq], m2
+ add csizeq, mmsize
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+RCT_INT
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+RCT_INT
+%endif
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
new file mode 100644
index 0000000000..baa81383ea
--- /dev/null
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -0,0 +1,50 @@
+/*
+ * SIMD optimized JPEG 2000 DSP functions
+ * Copyright (c) 2015 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/jpeg2000dsp.h"
+
+void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
+void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
+
+av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+ if (EXTERNAL_SSE(cpu_flags)) {
+ c->mct_decode[FF_DWT97] = ff_ict_float_sse;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
+ }
+
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ c->mct_decode[FF_DWT97] = ff_ict_float_avx;
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
+ }
+}
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/lossless_audiodsp.asm
index d6abd982e8..063d7b41af 100644
--- a/libavcodec/x86/apedsp.asm
+++ b/libavcodec/x86/lossless_audiodsp.asm
@@ -1,20 +1,20 @@
;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -58,14 +58,7 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
mova [v1q + orderq + mmsize], m3
add orderq, mmsize*2
jl .loop
-%if mmsize == 16
- movhlps m0, m6
- paddd m6, m0
- pshuflw m0, m6, 0x4e
-%else
- pshufw m0, m6, 0x4e
-%endif
- paddd m6, m0
+ HADDD m6, m0
movd eax, m6
RET
%endmacro
@@ -75,6 +68,39 @@ SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
+INIT_XMM sse4
+; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
+; int order, int mul)
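+; Returns the dot product of the original 16-bit v1 with the 32-bit v2 while
+; updating v1 in place: v1[i] += mul * v3[i] with 16-bit wraparound. The v1
+; words are sign-extended with pmovsxwd before the 32-bit multiply.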
+cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
+ shl orderq, 1
+ movd m7, mulm
+ SPLATW m7, m7
+ pxor m6, m6
+ add v1q, orderq
+ lea v2q, [v2q + 2*orderq]
+ add v3q, orderq
+ neg orderq
+.loop:
+ mova m3, [v1q + orderq]
+ movu m0, [v2q + 2*orderq]
+ pmovsxwd m4, m3
+ movu m1, [v2q + 2*orderq + mmsize]
+ movhlps m5, m3
+ movu m2, [v3q + orderq]
+ pmovsxwd m5, m5
+ pmullw m2, m7
+ pmulld m0, m4
+ pmulld m1, m5
+ paddw m2, m3
+ paddd m6, m0
+ paddd m6, m1
+ mova [v1q + orderq], m2
+ add orderq, 16
+ jl .loop
+ HADDD m6, m0
+ movd eax, m6
+ RET
+
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
@@ -159,9 +185,6 @@ SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
- movhlps m0, m6
- paddd m6, m0
- pshuflw m0, m6, 0x4e
- paddd m6, m0
+ HADDD m6, m0
movd eax, m6
RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/lossless_audiodsp_init.c
index f692c2b9b6..10b6a65622 100644
--- a/libavcodec/x86/apedsp_init.c
+++ b/libavcodec/x86/lossless_audiodsp_init.c
@@ -1,25 +1,25 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
const int16_t *v3,
@@ -31,8 +31,13 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
-av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+
+av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
{
+#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags))
@@ -44,4 +49,8 @@ av_cold void ff_apedsp_init_x86(APEDSPContext *c)
if (EXTERNAL_SSSE3(cpu_flags) &&
!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+
+ if (EXTERNAL_SSE4(cpu_flags))
+ c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
+#endif
}
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
new file mode 100644
index 0000000000..443fe02951
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -0,0 +1,290 @@
+;******************************************************************************
+;* SIMD lossless video DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Michael Niedermayer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_15
+pb_zzzzzzzz77777777: times 8 db -1
+pb_7: times 8 db 7
+pb_ef: times 8 db 14,15
+pb_67: times 8 db 6, 7
+pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
+pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
+pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
+
+SECTION .text
+
+; void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+;                                const uint8_t *diff, ptrdiff_t w,
+; int *left, int *left_top)
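+; Per byte: prediction = mid(l, t, l + t - tl) and the residual from diffq is
+; added; l is the previous output byte and tl the previous top byte, so the
+; %rep below processes one byte of each vector at a time.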
+%macro MEDIAN_PRED 0
+cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
+ movu m0, [topq]
+ mova m2, m0
+ movd m4, [left_topq]
+ LSHIFT m2, 1
+ mova m1, m0
+ por m4, m2
+ movd m3, [leftq]
+ psubb m0, m4 ; t-tl
+ add dstq, wq
+ add topq, wq
+ add diffq, wq
+ neg wq
+ jmp .skip
+.loop:
+ movu m4, [topq+wq]
+ mova m0, m4
+ LSHIFT m4, 1
+ por m4, m1
+ mova m1, m0 ; t
+ psubb m0, m4 ; t-tl
+.skip:
+ movu m2, [diffq+wq]
+%assign i 0
+%rep mmsize
+ mova m4, m0
+ paddb m4, m3 ; t-tl+l
+ mova m5, m3
+ pmaxub m3, m1
+ pminub m5, m1
+ pminub m3, m4
+ pmaxub m3, m5 ; median
+ paddb m3, m2 ; +residual
+%if i==0
+ mova m7, m3
+ LSHIFT m7, mmsize-1
+%else
+ mova m6, m3
+ RSHIFT m7, 1
+ LSHIFT m6, mmsize-1
+ por m7, m6
+%endif
+%if i<mmsize-1
+ RSHIFT m0, 1
+ RSHIFT m1, 1
+ RSHIFT m2, 1
+%endif
+%assign i i+1
+%endrep
+ movu [dstq+wq], m7
+ add wq, mmsize
+ jl .loop
+ movzx r2d, byte [dstq-1]
+ mov [leftq], r2d
+ movzx r2d, byte [topq-1]
+ mov [left_topq], r2d
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmxext
+MEDIAN_PRED
+%endif
+INIT_XMM sse2
+MEDIAN_PRED
+
+
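+; Left prediction over bytes as a prefix sum: within each vector the running
+; sum is built with shift-and-add steps (psllw/pshufb), and m0 carries the
+; last byte of the previous vector, broadcast with pshufb, into the next one.
+; The final accumulated left value is returned in eax.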
+%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+ add srcq, wq
+ add dstq, wq
+ neg wq
+%%.loop:
+%if %2
+ mova m1, [srcq+wq]
+%else
+ movu m1, [srcq+wq]
+%endif
+ mova m2, m1
+ psllw m1, 8
+ paddb m1, m2
+ mova m2, m1
+ pshufb m1, m3
+ paddb m1, m2
+ pshufb m0, m5
+ mova m2, m1
+ pshufb m1, m4
+ paddb m1, m2
+%if mmsize == 16
+ mova m2, m1
+ pshufb m1, m6
+ paddb m1, m2
+%endif
+ paddb m0, m1
+%if %1
+ mova [dstq+wq], m0
+%else
+ movq [dstq+wq], m0
+ movhps [dstq+wq+8], m0
+%endif
+ add wq, mmsize
+ jl %%.loop
+ mov eax, mmsize-1
+ sub eax, wd
+ movd m1, eax
+ pshufb m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, ptrdiff_t w, int left)
+INIT_MMX ssse3
+cglobal add_left_pred, 3,3,7, dst, src, w, left
+.skip_prologue:
+ mova m5, [pb_7]
+ mova m4, [pb_zzzz3333zzzzbbbb]
+ mova m3, [pb_zz11zz55zz99zzdd]
+ movd m0, leftm
+ psllq m0, 56
+ ADD_LEFT_LOOP 1, 1
+
+INIT_XMM ssse3
+cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
+ mova m5, [pb_15]
+ mova m6, [pb_zzzzzzzz77777777]
+ mova m4, [pb_zzzz3333zzzzbbbb]
+ mova m3, [pb_zz11zz55zz99zzdd]
+ movd m0, leftm
+ pslldq m0, 15
+ test srcq, 15
+ jnz .src_unaligned
+ test dstq, 15
+ jnz .dst_unaligned
+ ADD_LEFT_LOOP 1, 1
+.dst_unaligned:
+ ADD_LEFT_LOOP 0, 1
+.src_unaligned:
+ ADD_LEFT_LOOP 0, 0
+
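+; dst[i] += src[i] for w bytes: the bulk is done two vectors at a time, any
+; remainder with a scalar byte loop.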
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+ mov sizeq, wq
+ and sizeq, -2*mmsize
+ jz .2
+ add dstq, sizeq
+ add srcq, sizeq
+ neg sizeq
+.1:
+ mova m0, [srcq + sizeq]
+ mova m1, [srcq + sizeq + mmsize]
+ paddb m0, [dstq + sizeq]
+ paddb m1, [dstq + sizeq + mmsize]
+ mova [dstq + sizeq], m0
+ mova [dstq + sizeq + mmsize], m1
+ add sizeq, 2*mmsize
+ jl .1
+.2:
+ and wq, 2*mmsize-1
+ jz .end
+ add dstq, wq
+ add srcq, wq
+ neg wq
+.3:
+ mov sizeb, [srcq + wq]
+ add [dstq + wq], sizeb
+ inc wq
+ jl .3
+.end:
+ REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_BYTES
+%endif
+INIT_XMM sse2
+ADD_BYTES
+
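+; 16-bit variant of the left-prediction prefix sum: words are accumulated
+; with pslld/pshufb steps, masked to the sample bit depth (m7), and the last
+; word of each vector is carried into the next one through m0.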
+%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
+ add wd, wd
+ add srcq, wq
+ add dstq, wq
+ neg wq
+%%.loop:
+ mov%2 m1, [srcq+wq]
+ mova m2, m1
+ pslld m1, 16
+ paddw m1, m2
+ mova m2, m1
+
+ pshufb m1, m3
+ paddw m1, m2
+ pshufb m0, m5
+%if mmsize == 16
+ mova m2, m1
+ pshufb m1, m4
+ paddw m1, m2
+%endif
+ paddw m0, m1
+ pand m0, m7
+%ifidn %1, a
+ mova [dstq+wq], m0
+%else
+ movq [dstq+wq], m0
+ movhps [dstq+wq+8], m0
+%endif
+ add wq, mmsize
+ jl %%.loop
+ mov eax, mmsize-1
+ sub eax, wd
+ mov wd, eax
+ shl wd, 8
+ lea eax, [wd+eax-1]
+ movd m1, eax
+ pshufb m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+; int ff_add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc)
+INIT_MMX ssse3
+cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
+.skip_prologue:
+ mova m5, [pb_67]
+ mova m3, [pb_zzzz2323zzzzabab]
+ movd m0, leftm
+ psllq m0, 48
+ movd m7, maskm
+    SPLATW          m7, m7
+ ADD_HFYU_LEFT_LOOP_INT16 a, a
+
+INIT_XMM sse4
+cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
+ mova m5, [pb_ef]
+ mova m4, [pb_zzzzzzzz67676767]
+ mova m3, [pb_zzzz2323zzzzabab]
+ movd m0, leftm
+ pslldq m0, 14
+ movd m7, maskm
+    SPLATW          m7, m7
+ test srcq, 15
+ jnz .src_unaligned
+ test dstq, 15
+ jnz .dst_unaligned
+ ADD_HFYU_LEFT_LOOP_INT16 a, a
+.dst_unaligned:
+ ADD_HFYU_LEFT_LOOP_INT16 u, a
+.src_unaligned:
+ ADD_HFYU_LEFT_LOOP_INT16 u, u
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
new file mode 100644
index 0000000000..21bbd12bd2
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -0,0 +1,118 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/x86/asm.h"
+#include "../lossless_videodsp.h"
+#include "libavutil/x86/cpu.h"
+
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+
+void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+ const uint8_t *diff, ptrdiff_t w,
+ int *left, int *left_top);
+void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
+ const uint8_t *diff, ptrdiff_t w,
+ int *left, int *left_top);
+
+int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t w, int left);
+int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t w, int left);
+
+int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
+int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
+
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
+static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
+ const uint8_t *diff, ptrdiff_t w,
+ int *left, int *left_top)
+{
+ x86_reg w2 = -w;
+ x86_reg x;
+ int l = *left & 0xff;
+ int tl = *left_top & 0xff;
+ int t;
+ __asm__ volatile (
+ "mov %7, %3 \n"
+ "1: \n"
+ "movzbl (%3, %4), %2 \n"
+ "mov %2, %k3 \n"
+ "sub %b1, %b3 \n"
+ "add %b0, %b3 \n"
+ "mov %2, %1 \n"
+ "cmp %0, %2 \n"
+ "cmovg %0, %2 \n"
+ "cmovg %1, %0 \n"
+ "cmp %k3, %0 \n"
+ "cmovg %k3, %0 \n"
+ "mov %7, %3 \n"
+ "cmp %2, %0 \n"
+ "cmovl %2, %0 \n"
+ "add (%6, %4), %b0 \n"
+ "mov %b0, (%5, %4) \n"
+ "inc %4 \n"
+ "jl 1b \n"
+ : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
+ : "r"(dst + w), "r"(diff + w), "rm"(top + w)
+ );
+ *left = l;
+ *left_top = tl;
+}
+#endif
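The cmov loop above implements the classic HuffYUV median predictor: each output byte is the stored difference plus the median of left, top and left+top-topleft. A scalar sketch of that recurrence (helper names illustrative, usual mid-pred semantics assumed):

#include <stddef.h>
#include <stdint.h>

static int mid_pred3(int a, int b, int c)
{
    /* median of three values */
    if (a > b) { int t = a; a = b; b = t; }
    return c < a ? a : c > b ? b : c;
}

/* Scalar sketch of add_median_pred. */
static void add_median_pred_ref(uint8_t *dst, const uint8_t *top,
                                const uint8_t *diff, ptrdiff_t w,
                                int *left, int *left_top)
{
    int l = *left & 0xff, lt = *left_top & 0xff;
    ptrdiff_t i;
    for (i = 0; i < w; i++) {
        l      = (mid_pred3(l, top[i], (l + top[i] - lt) & 0xff) + diff[i]) & 0xff;
        lt     = top[i];
        dst[i] = l;
    }
    *left     = l;
    *left_top = lt;
}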
+
+void ff_llviddsp_init_x86(LLVidDSPContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
+ if (cpu_flags & AV_CPU_FLAG_CMOV)
+ c->add_median_pred = add_median_pred_cmov;
+#endif
+
+ if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+ c->add_bytes = ff_add_bytes_mmx;
+ }
+
+ if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
+ /* slower than cmov version on AMD */
+ if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
+ c->add_median_pred = ff_add_median_pred_mmxext;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->add_bytes = ff_add_bytes_sse2;
+ c->add_median_pred = ff_add_median_pred_sse2;
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ c->add_left_pred = ff_add_left_pred_ssse3;
+ c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
+ }
+
+ if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
+ c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
+ }
+
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->add_left_pred_int16 = ff_add_left_pred_int16_sse4;
+ }
+}
diff --git a/libavcodec/x86/lossless_videoencdsp.asm b/libavcodec/x86/lossless_videoencdsp.asm
new file mode 100644
index 0000000000..63fd72174a
--- /dev/null
+++ b/libavcodec/x86/lossless_videoencdsp.asm
@@ -0,0 +1,150 @@
+;************************************************************************
+;* SIMD-optimized lossless video encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+section .text
+
+; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; intptr_t w);
+%macro DIFF_BYTES_PROLOGUE 0
+%if ARCH_X86_32
+cglobal diff_bytes, 3,5,2, dst, src1, src2
+%define wq r4q
+ DECLARE_REG_TMP 3
+ mov wq, r3mp
+%else
+cglobal diff_bytes, 4,5,2, dst, src1, src2, w
+ DECLARE_REG_TMP 4
+%endif ; ARCH_X86_32
+%define i t0q
+%endmacro
+
+; label to jump to if w < regsize
+%macro DIFF_BYTES_LOOP_PREP 1
+ mov i, wq
+ and i, -2 * regsize
+ jz %1
+ add dstq, i
+ add src1q, i
+ add src2q, i
+ neg i
+%endmacro
+
+; mov type used for src1q, dstq, first reg, second reg
+%macro DIFF_BYTES_LOOP_CORE 4
+%if mmsize != 16
+ mov%1 %3, [src1q + i]
+ mov%1 %4, [src1q + i + regsize]
+ psubb %3, [src2q + i]
+ psubb %4, [src2q + i + regsize]
+ mov%2 [dstq + i], %3
+ mov%2 [regsize + dstq + i], %4
+%else
+ ; SSE enforces alignment of psubb operand
+ mov%1 %3, [src1q + i]
+ movu %4, [src2q + i]
+ psubb %3, %4
+ mov%2 [dstq + i], %3
+ mov%1 %3, [src1q + i + regsize]
+ movu %4, [src2q + i + regsize]
+ psubb %3, %4
+ mov%2 [regsize + dstq + i], %3
+%endif
+%endmacro
+
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
+ %define regsize mmsize
+.loop_%1%2:
+ DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
+ add i, 2 * regsize
+ jl .loop_%1%2
+.skip_main_%1%2:
+ and wq, 2 * regsize - 1
+ jz .end_%1%2
+%if mmsize > 16
+ ; fall back to narrower xmm
+ %define regsize mmsize / 2
+ DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
+.loop2_%1%2:
+ DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
+ add i, 2 * regsize
+ jl .loop2_%1%2
+.setup_loop_gpr_%1%2:
+ and wq, 2 * regsize - 1
+ jz .end_%1%2
+%endif
+ add dstq, wq
+ add src1q, wq
+ add src2q, wq
+ neg wq
+.loop_gpr_%1%2:
+ mov t0b, [src1q + wq]
+ sub t0b, [src2q + wq]
+ mov [dstq + wq], t0b
+ inc wq
+ jl .loop_gpr_%1%2
+.end_%1%2:
+ REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES_PROLOGUE
+ %define regsize mmsize
+ DIFF_BYTES_LOOP_PREP .skip_main_aa
+ DIFF_BYTES_BODY a, a
+%undef i
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES_PROLOGUE
+ %define regsize mmsize
+ DIFF_BYTES_LOOP_PREP .skip_main_aa
+ test dstq, regsize - 1
+ jnz .loop_uu
+ test src1q, regsize - 1
+ jnz .loop_ua
+ DIFF_BYTES_BODY a, a
+ DIFF_BYTES_BODY u, a
+ DIFF_BYTES_BODY u, u
+%undef i
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_BYTES_PROLOGUE
+ %define regsize mmsize
+ ; Directly using unaligned SSE2 version is marginally faster than
+ ; branching based on arguments.
+ DIFF_BYTES_LOOP_PREP .skip_main_uu
+ test dstq, regsize - 1
+ jnz .loop_uu
+ test src1q, regsize - 1
+ jnz .loop_ua
+ DIFF_BYTES_BODY a, a
+ DIFF_BYTES_BODY u, a
+ DIFF_BYTES_BODY u, u
+%undef i
+%endif
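diff_bytes is the encoder-side inverse of add_bytes; a scalar sketch of what all three variants compute (illustrative helper name):

#include <stdint.h>

/* Scalar sketch of diff_bytes: dst[i] = src1[i] - src2[i], wrapping mod 256. */
static void diff_bytes_ref(uint8_t *dst, const uint8_t *src1,
                           const uint8_t *src2, intptr_t w)
{
    intptr_t i;
    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}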
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/lossless_videoencdsp_init.c
index 8ffaced37d..fc728c9fd1 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/lossless_videoencdsp_init.c
@@ -1,24 +1,24 @@
/*
- * SIMD-optimized HuffYUV encoding functions
+ * SIMD-optimized lossless video encoding functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,38 +26,21 @@
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/huffyuvencdsp.h"
+#include "libavcodec/lossless_videoencdsp.h"
#include "libavcodec/mathops.h"
-#if HAVE_INLINE_ASM
-
-static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
-{
- x86_reg i = 0;
-
- __asm__ volatile (
- "1: \n\t"
- "movq (%2, %0), %%mm0 \n\t"
- "movq (%1, %0), %%mm1 \n\t"
- "psubb %%mm0, %%mm1 \n\t"
- "movq %%mm1, (%3, %0) \n\t"
- "movq 8(%2, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "psubb %%mm0, %%mm1 \n\t"
- "movq %%mm1, 8(%3, %0) \n\t"
- "add $16, %0 \n\t"
- "cmp %4, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (i)
- : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w - 15));
+void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+ intptr_t w);
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+ intptr_t w);
+void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+ intptr_t w);
- for (; i < w; i++)
- dst[i + 0] = src1[i + 0] - src2[i + 0];
-}
+#if HAVE_INLINE_ASM
-static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
- const uint8_t *src2, int w,
- int *left, int *left_top)
+static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
+ const uint8_t *src2, intptr_t w,
+ int *left, int *left_top)
{
x86_reg i = 0;
uint8_t l, lt;
@@ -97,17 +80,25 @@ static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
#endif /* HAVE_INLINE_ASM */
-av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
+av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c)
{
-#if HAVE_INLINE_ASM
- int cpu_flags = av_get_cpu_flags();
+ av_unused int cpu_flags = av_get_cpu_flags();
- if (INLINE_MMX(cpu_flags)) {
- c->diff_bytes = diff_bytes_mmx;
+ if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+ c->diff_bytes = ff_diff_bytes_mmx;
}
+#if HAVE_INLINE_ASM
if (INLINE_MMXEXT(cpu_flags)) {
- c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
+ c->sub_median_pred = sub_median_pred_mmxext;
}
#endif /* HAVE_INLINE_ASM */
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->diff_bytes = ff_diff_bytes_sse2;
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->diff_bytes = ff_diff_bytes_avx2;
+ }
}
diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c
index e8cce42af4..6c72e21bac 100644
--- a/libavcodec/x86/lpc.c
+++ b/libavcodec/x86/lpc.c
@@ -2,26 +2,25 @@
* SIMD-optimized LPC functions
* Copyright (c) 2007 Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -73,6 +72,7 @@ static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
"3: \n\t"
:"+&r"(i), "+&r"(j)
:"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
+ NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm5", "%xmm6", "%xmm7")
);
@@ -117,6 +117,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
"movsd %%xmm2, 16(%1) \n\t"
:"+&r"(i)
:"r"(autoc+j), "r"(data+len), "r"(data+len-j)
+ NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
:"memory"
);
} else {
@@ -140,6 +141,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
"movsd %%xmm1, %2 \n\t"
:"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
:"r"(data+len), "r"(data+len-j)
+ NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
);
}
}
@@ -152,7 +154,7 @@ av_cold void ff_lpc_init_x86(LPCContext *c)
#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
- if (INLINE_SSE2_SLOW(cpu_flags)) {
+ if (INLINE_SSE2(cpu_flags) || INLINE_SSE2_SLOW(cpu_flags)) {
c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
}
diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h
index 2c04d9d1bd..6298f5ed19 100644
--- a/libavcodec/x86/mathops.h
+++ b/libavcodec/x86/mathops.h
@@ -2,20 +2,20 @@
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -105,7 +105,7 @@ __asm__ volatile(\
#endif /* HAVE_I686 */
#define MASK_ABS(mask, level) \
- __asm__ ("cltd \n\t" \
+ __asm__ ("cdq \n\t" \
"xorl %1, %0 \n\t" \
"subl %1, %0 \n\t" \
: "+a"(level), "=&d"(mask))
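The cdq/xorl/subl sequence above is the standard branchless absolute value: cdq spreads the sign of the value into the mask register, and xor+sub then negates negative inputs. A C sketch of the two results MASK_ABS leaves behind (illustrative helper, assumes 32-bit int and arithmetic right shift):

/* Sketch of MASK_ABS(mask, level): mask = sign mask of level, level = |level|. */
static inline void mask_abs_ref(int *mask, int *level)
{
    int m  = *level >> 31;        /* 0 for non-negative, -1 for negative (cdq) */
    *level = (*level ^ m) - m;    /* xorl + subl */
    *mask  = m;
}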
diff --git a/libavcodec/x86/mdct.h b/libavcodec/x86/mdct.h
deleted file mode 100644
index cc107cb86a..0000000000
--- a/libavcodec/x86/mdct.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_MDCT_H
-#define AVCODEC_X86_MDCT_H
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-#endif /* AVCODEC_X86_MDCT_H */
diff --git a/libavcodec/x86/mdct_init.c b/libavcodec/x86/mdct_init.c
deleted file mode 100644
index db642d863d..0000000000
--- a/libavcodec/x86/mdct_init.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-
-#include "mdct.h"
-
-av_cold void ff_mdct_init_x86(FFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
- if (EXTERNAL_AMD3DNOW(cpu_flags)) {
- s->imdct_calc = ff_imdct_calc_3dnow;
- s->imdct_half = ff_imdct_half_3dnow;
- }
-
- if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
- s->imdct_calc = ff_imdct_calc_3dnowext;
- s->imdct_half = ff_imdct_half_3dnowext;
- }
-#endif /* ARCH_X86_32 */
-
- if (EXTERNAL_SSE(cpu_flags)) {
- s->imdct_calc = ff_imdct_calc_sse;
- s->imdct_half = ff_imdct_half_sse;
- }
-
- if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
- s->imdct_half = ff_imdct_half_avx;
- }
-}
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 1a87f37b39..ad06d485ab 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -4,25 +4,30 @@
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"
+SECTION_RODATA
+
+cextern pb_1
+cextern pb_80
+
SECTION .text
%macro DIFF_PIXELS_1 4
@@ -210,7 +215,7 @@ hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
-; uint8_t *src2, int stride, int h)
+; uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2x (and that's why we access rsp+gprsize
@@ -274,19 +279,27 @@ INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9
-INIT_XMM sse2
-; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-; int line_size, int h);
-cglobal sse16, 5, 5, 8
- shr r4d, 1
+; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+; ptrdiff_t line_size, int h)
+
+%macro SUM_SQUARED_ERRORS 1
+cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
+%if %1 == mmsize
+ shr hd, 1
+%endif
pxor m0, m0 ; mm0 = 0
pxor m7, m7 ; mm7 holds the sum
.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
- movu m1, [r1 ] ; mm1 = pix1[0][0-15]
- movu m2, [r2 ] ; mm2 = pix2[0][0-15]
- movu m3, [r1+r3] ; mm3 = pix1[1][0-15]
- movu m4, [r2+r3] ; mm4 = pix2[1][0-15]
+ movu m1, [pix1q] ; m1 = pix1[0][0-15], [0-7] for mmx
+ movu m2, [pix2q] ; m2 = pix2[0][0-15], [0-7] for mmx
+%if %1 == mmsize
+ movu m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
+ movu m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
+%else ; %1 / 2 == mmsize; mmx only
+ mova m3, [pix1q+8] ; m3 = pix1[0][8-15]
+ mova m4, [pix2q+8] ; m4 = pix2[0][8-15]
+%endif
; todo: mm1-mm2, mm3-mm4
; algo: subtract mm1 from mm2 with saturation and vice versa
@@ -315,22 +328,607 @@ cglobal sse16, 5, 5, 8
pmaddwd m1, m1
pmaddwd m3, m3
- lea r1, [r1+r3*2] ; pix1 += 2*line_size
- lea r2, [r2+r3*2] ; pix2 += 2*line_size
-
paddd m1, m2
paddd m3, m4
paddd m7, m1
paddd m7, m3
- dec r4
+%if %1 == mmsize
+ lea pix1q, [pix1q + 2*lsizeq]
+ lea pix2q, [pix2q + 2*lsizeq]
+%else
+ add pix1q, lsizeq
+ add pix2q, lsizeq
+%endif
+ dec hd
jnz .next2lines
- mova m1, m7
- psrldq m7, 8 ; shift hi qword to lo
- paddd m7, m1
- mova m1, m7
- psrldq m7, 4 ; shift hi dword to lo
- paddd m7, m1
+ HADDD m7, m1
movd eax, m7 ; return value
RET
+%endmacro
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 8
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 16
+
+INIT_XMM sse2
+SUM_SQUARED_ERRORS 16
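These kernels return the sum of squared pixel differences over an 8- or 16-pixel-wide block; a scalar sketch of the metric (illustrative helper name):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of ff_sse{8,16}: sum of squared differences over a w x h block. */
static int sse_block_ref(const uint8_t *pix1, const uint8_t *pix2,
                         ptrdiff_t line_size, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum  += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}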
+
+;-----------------------------------------------
+;int ff_sum_abs_dctelem(int16_t *block)
+;-----------------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline loops
+
+%macro SUM_ABS_DCTELEM 2
+cglobal sum_abs_dctelem, 1, 1, %1, block
+ pxor m0, m0
+ pxor m1, m1
+%assign %%i 0
+%rep %2
+ mova m2, [blockq+mmsize*(0+%%i)]
+ mova m3, [blockq+mmsize*(1+%%i)]
+ mova m4, [blockq+mmsize*(2+%%i)]
+ mova m5, [blockq+mmsize*(3+%%i)]
+ ABS1_SUM m2, m6, m0
+ ABS1_SUM m3, m6, m1
+ ABS1_SUM m4, m6, m0
+ ABS1_SUM m5, m6, m1
+%assign %%i %%i+4
+%endrep
+ paddusw m0, m1
+ HSUM m0, m1, eax
+ and eax, 0xFFFF
+ RET
+%endmacro
+
+INIT_MMX mmx
+SUM_ABS_DCTELEM 0, 4
+INIT_MMX mmxext
+SUM_ABS_DCTELEM 0, 4
+INIT_XMM sse2
+SUM_ABS_DCTELEM 7, 2
+INIT_XMM ssse3
+SUM_ABS_DCTELEM 6, 2
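sum_abs_dctelem sums the absolute values of all 64 coefficients of an 8x8 transform block; a scalar sketch (illustrative; the asm accumulates with unsigned saturation and masks the result to 16 bits):

#include <stdint.h>

/* Scalar sketch of ff_sum_abs_dctelem. */
static int sum_abs_dctelem_ref(const int16_t *block)
{
    int sum = 0;
    for (int i = 0; i < 64; i++)
        sum += block[i] < 0 ? -block[i] : block[i];
    return sum & 0xffff;
}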
+
+;------------------------------------------------------------------------------
+; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
+;------------------------------------------------------------------------------
+; %1 = 8/16. %2-5=m#
+%macro HF_NOISE_PART1 5
+ mova m%2, [pix1q]
+%if %1 == 8
+ mova m%3, m%2
+ psllq m%2, 8
+ psrlq m%3, 8
+ psrlq m%2, 8
+%else
+ mova m%3, [pix1q+1]
+%endif
+ mova m%4, m%2
+ mova m%5, m%3
+ punpcklbw m%2, m7
+ punpcklbw m%3, m7
+ punpckhbw m%4, m7
+ punpckhbw m%5, m7
+ psubw m%2, m%3
+ psubw m%4, m%5
+%endmacro
+
+; %1-4 = m#
+%macro HF_NOISE_PART2 4
+ psubw m%1, m%3
+ psubw m%2, m%4
+ pxor m3, m3
+ pxor m1, m1
+ pcmpgtw m3, m%1
+ pcmpgtw m1, m%2
+ pxor m%1, m3
+ pxor m%2, m1
+ psubw m%1, m3
+ psubw m%2, m1
+ paddw m%2, m%1
+ paddw m6, m%2
+%endmacro
+
+; %1 = 8/16
+%macro HF_NOISE 1
+cglobal hf_noise%1, 3,3,0, pix1, lsize, h
+ sub hd, 2
+ pxor m7, m7
+ pxor m6, m6
+ HF_NOISE_PART1 %1, 0, 1, 2, 3
+ add pix1q, lsizeq
+ HF_NOISE_PART1 %1, 4, 1, 5, 3
+ HF_NOISE_PART2 0, 2, 4, 5
+ add pix1q, lsizeq
+.loop:
+ HF_NOISE_PART1 %1, 0, 1, 2, 3
+ HF_NOISE_PART2 4, 5, 0, 2
+ add pix1q, lsizeq
+ HF_NOISE_PART1 %1, 4, 1, 5, 3
+ HF_NOISE_PART2 0, 2, 4, 5
+ add pix1q, lsizeq
+ sub hd, 2
+ jne .loop
+
+ mova m0, m6
+ punpcklwd m0, m7
+ punpckhwd m6, m7
+ paddd m6, m0
+ mova m0, m6
+ psrlq m6, 32
+ paddd m0, m6
+ movd eax, m0 ; eax = result of hf_noise8;
+ REP_RET ; return eax;
+%endmacro
+
+INIT_MMX mmx
+HF_NOISE 8
+HF_NOISE 16
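Roughly, the hf_noise kernels measure high-frequency content as the sum of absolute second differences: a horizontal first difference per row, then the absolute vertical difference of those, accumulated over the block. A scalar sketch of that quantity (illustrative only; edge handling in the asm differs slightly):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the quantity hf_noise accumulates. */
static int hf_noise_ref(const uint8_t *pix, ptrdiff_t lsize, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < w - 1; x++) {
            int d0 = pix[x]         - pix[x + 1];
            int d1 = pix[lsize + x] - pix[lsize + x + 1];
            sum   += d0 > d1 ? d0 - d1 : d1 - d0;
        }
        pix += lsize;
    }
    return sum;
}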
+
+;---------------------------------------------------------------------------------------
+;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;---------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD 1
+cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
+ movu m2, [pix2q]
+ movu m1, [pix2q+strideq]
+ psadbw m2, [pix1q]
+ psadbw m1, [pix1q+strideq]
+ paddw m2, m1
+%if %1 != mmsize
+ movu m0, [pix2q+8]
+ movu m1, [pix2q+strideq+8]
+ psadbw m0, [pix1q+8]
+ psadbw m1, [pix1q+strideq+8]
+ paddw m2, m0
+ paddw m2, m1
+%endif
+ sub hd, 2
+
+align 16
+.loop:
+ lea pix1q, [pix1q+strideq*2]
+ lea pix2q, [pix2q+strideq*2]
+ movu m0, [pix2q]
+ movu m1, [pix2q+strideq]
+ psadbw m0, [pix1q]
+ psadbw m1, [pix1q+strideq]
+ paddw m2, m0
+ paddw m2, m1
+%if %1 != mmsize
+ movu m0, [pix2q+8]
+ movu m1, [pix2q+strideq+8]
+ psadbw m0, [pix1q+8]
+ psadbw m1, [pix1q+strideq+8]
+ paddw m2, m0
+ paddw m2, m1
+%endif
+ sub hd, 2
+ jg .loop
+%if mmsize == 16
+ movhlps m0, m2
+ paddw m2, m0
+%endif
+ movd eax, m2
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SAD 8
+SAD 16
+INIT_XMM sse2
+SAD 16
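These are plain SADs: the sum of absolute pixel differences over the block. A scalar sketch (illustrative helper name):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of ff_sad{8,16}: sum of absolute differences over a w x h block. */
static int sad_ref(const uint8_t *pix1, const uint8_t *pix2,
                   ptrdiff_t stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum  += d < 0 ? -d : d;
        }
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}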
+
+;------------------------------------------------------------------------------------------
+;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_X2 1
+cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
+ movu m0, [pix2q]
+ movu m2, [pix2q+strideq]
+%if mmsize == 16
+ movu m3, [pix2q+1]
+ movu m4, [pix2q+strideq+1]
+ pavgb m0, m3
+ pavgb m2, m4
+%else
+ pavgb m0, [pix2q+1]
+ pavgb m2, [pix2q+strideq+1]
+%endif
+ psadbw m0, [pix1q]
+ psadbw m2, [pix1q+strideq]
+ paddw m0, m2
+%if %1 != mmsize
+ movu m1, [pix2q+8]
+ movu m2, [pix2q+strideq+8]
+ pavgb m1, [pix2q+9]
+ pavgb m2, [pix2q+strideq+9]
+ psadbw m1, [pix1q+8]
+ psadbw m2, [pix1q+strideq+8]
+ paddw m0, m1
+ paddw m0, m2
+%endif
+ sub hd, 2
+
+align 16
+.loop:
+ lea pix1q, [pix1q+2*strideq]
+ lea pix2q, [pix2q+2*strideq]
+ movu m1, [pix2q]
+ movu m2, [pix2q+strideq]
+%if mmsize == 16
+ movu m3, [pix2q+1]
+ movu m4, [pix2q+strideq+1]
+ pavgb m1, m3
+ pavgb m2, m4
+%else
+ pavgb m1, [pix2q+1]
+ pavgb m2, [pix2q+strideq+1]
+%endif
+ psadbw m1, [pix1q]
+ psadbw m2, [pix1q+strideq]
+ paddw m0, m1
+ paddw m0, m2
+%if %1 != mmsize
+ movu m1, [pix2q+8]
+ movu m2, [pix2q+strideq+8]
+ pavgb m1, [pix2q+9]
+ pavgb m2, [pix2q+strideq+9]
+ psadbw m1, [pix1q+8]
+ psadbw m2, [pix1q+strideq+8]
+ paddw m0, m1
+ paddw m0, m2
+%endif
+ sub hd, 2
+ jg .loop
+%if mmsize == 16
+ movhlps m1, m0
+ paddw m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_X2 8
+SAD_X2 16
+INIT_XMM sse2
+SAD_X2 16
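The _x2 variants compare pix1 against the horizontal half-pel interpolation of pix2, i.e. each reference sample is the rounded average of a pixel and its right neighbour (pavgb). A scalar sketch (illustrative; like the asm it reads one column past the block width):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of ff_sad*_x2. */
static int sad_x2_ref(const uint8_t *pix1, const uint8_t *pix2,
                      ptrdiff_t stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int avg = (pix2[x] + pix2[x + 1] + 1) >> 1;  /* pavgb rounding */
            int d   = pix1[x] - avg;
            sum    += d < 0 ? -d : d;
        }
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}

The _y2 and _approx_xy2 kernels below follow the same pattern with vertical and diagonal half-pel references.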
+
+;------------------------------------------------------------------------------------------
+;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_Y2 1
+cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
+ movu m1, [pix2q]
+ movu m0, [pix2q+strideq]
+ movu m3, [pix2q+2*strideq]
+ pavgb m1, m0
+ pavgb m0, m3
+ psadbw m1, [pix1q]
+ psadbw m0, [pix1q+strideq]
+ paddw m0, m1
+ mova m1, m3
+%if %1 != mmsize
+ movu m4, [pix2q+8]
+ movu m5, [pix2q+strideq+8]
+ movu m6, [pix2q+2*strideq+8]
+ pavgb m4, m5
+ pavgb m5, m6
+ psadbw m4, [pix1q+8]
+ psadbw m5, [pix1q+strideq+8]
+ paddw m0, m4
+ paddw m0, m5
+ mova m4, m6
+%endif
+ add pix2q, strideq
+ sub hd, 2
+
+align 16
+.loop:
+ lea pix1q, [pix1q+2*strideq]
+ lea pix2q, [pix2q+2*strideq]
+ movu m2, [pix2q]
+ movu m3, [pix2q+strideq]
+ pavgb m1, m2
+ pavgb m2, m3
+ psadbw m1, [pix1q]
+ psadbw m2, [pix1q+strideq]
+ paddw m0, m1
+ paddw m0, m2
+ mova m1, m3
+%if %1 != mmsize
+ movu m5, [pix2q+8]
+ movu m6, [pix2q+strideq+8]
+ pavgb m4, m5
+ pavgb m5, m6
+ psadbw m4, [pix1q+8]
+ psadbw m5, [pix1q+strideq+8]
+ paddw m0, m4
+ paddw m0, m5
+ mova m4, m6
+%endif
+ sub hd, 2
+ jg .loop
+%if mmsize == 16
+ movhlps m1, m0
+ paddw m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_Y2 8
+SAD_Y2 16
+INIT_XMM sse2
+SAD_Y2 16
+
+;-------------------------------------------------------------------------------------------
+;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;-------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_APPROX_XY2 1
+cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
+ mova m4, [pb_1]
+ movu m1, [pix2q]
+ movu m0, [pix2q+strideq]
+ movu m3, [pix2q+2*strideq]
+%if mmsize == 16
+ movu m5, [pix2q+1]
+ movu m6, [pix2q+strideq+1]
+ movu m2, [pix2q+2*strideq+1]
+ pavgb m1, m5
+ pavgb m0, m6
+ pavgb m3, m2
+%else
+ pavgb m1, [pix2q+1]
+ pavgb m0, [pix2q+strideq+1]
+ pavgb m3, [pix2q+2*strideq+1]
+%endif
+ psubusb m0, m4
+ pavgb m1, m0
+ pavgb m0, m3
+ psadbw m1, [pix1q]
+ psadbw m0, [pix1q+strideq]
+ paddw m0, m1
+ mova m1, m3
+%if %1 != mmsize
+ movu m5, [pix2q+8]
+ movu m6, [pix2q+strideq+8]
+ movu m7, [pix2q+2*strideq+8]
+ pavgb m5, [pix2q+1+8]
+ pavgb m6, [pix2q+strideq+1+8]
+ pavgb m7, [pix2q+2*strideq+1+8]
+ psubusb m6, m4
+ pavgb m5, m6
+ pavgb m6, m7
+ psadbw m5, [pix1q+8]
+ psadbw m6, [pix1q+strideq+8]
+ paddw m0, m5
+ paddw m0, m6
+ mova m5, m7
+%endif
+ add pix2q, strideq
+ sub hd, 2
+
+align 16
+.loop:
+ lea pix1q, [pix1q+2*strideq]
+ lea pix2q, [pix2q+2*strideq]
+ movu m2, [pix2q]
+ movu m3, [pix2q+strideq]
+%if mmsize == 16
+ movu m5, [pix2q+1]
+ movu m6, [pix2q+strideq+1]
+ pavgb m2, m5
+ pavgb m3, m6
+%else
+ pavgb m2, [pix2q+1]
+ pavgb m3, [pix2q+strideq+1]
+%endif
+ psubusb m2, m4
+ pavgb m1, m2
+ pavgb m2, m3
+ psadbw m1, [pix1q]
+ psadbw m2, [pix1q+strideq]
+ paddw m0, m1
+ paddw m0, m2
+ mova m1, m3
+%if %1 != mmsize
+ movu m6, [pix2q+8]
+ movu m7, [pix2q+strideq+8]
+ pavgb m6, [pix2q+8+1]
+ pavgb m7, [pix2q+strideq+8+1]
+ psubusb m6, m4
+ pavgb m5, m6
+ pavgb m6, m7
+ psadbw m5, [pix1q+8]
+ psadbw m6, [pix1q+strideq+8]
+ paddw m0, m5
+ paddw m0, m6
+ mova m5, m7
+%endif
+ sub hd, 2
+ jg .loop
+%if mmsize == 16
+ movhlps m1, m0
+ paddw m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_APPROX_XY2 8
+SAD_APPROX_XY2 16
+INIT_XMM sse2
+SAD_APPROX_XY2 16
+
+;--------------------------------------------------------------------
+;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+; ptrdiff_t line_size, int h);
+;--------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_INTRA 1
+cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
+ mova m0, [pix1q]
+%if %1 == mmsize
+ mova m2, [pix1q+lsizeq]
+ psadbw m0, m2
+%else
+ mova m2, [pix1q+lsizeq]
+ mova m3, [pix1q+8]
+ mova m4, [pix1q+lsizeq+8]
+ psadbw m0, m2
+ psadbw m3, m4
+ paddw m0, m3
+%endif
+ sub hd, 2
+
+.loop:
+ lea pix1q, [pix1q + 2*lsizeq]
+%if %1 == mmsize
+ mova m1, [pix1q]
+ psadbw m2, m1
+ paddw m0, m2
+ mova m2, [pix1q+lsizeq]
+ psadbw m1, m2
+ paddw m0, m1
+%else
+ mova m1, [pix1q]
+ mova m3, [pix1q+8]
+ psadbw m2, m1
+ psadbw m4, m3
+ paddw m0, m2
+ paddw m0, m4
+ mova m2, [pix1q+lsizeq]
+ mova m4, [pix1q+lsizeq+8]
+ psadbw m1, m2
+ psadbw m3, m4
+ paddw m0, m1
+ paddw m0, m3
+%endif
+ sub hd, 2
+ jg .loop
+
+%if mmsize == 16
+ pshufd m1, m0, 0xe
+ paddd m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_INTRA 8
+VSAD_INTRA 16
+INIT_XMM sse2
+VSAD_INTRA 16
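vsad_intra measures vertical activity within a single block: the SAD between each row and the row below it. A scalar sketch (illustrative helper name):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of ff_vsad_intra{8,16}. */
static int vsad_intra_ref(const uint8_t *pix, ptrdiff_t line_size, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < w; x++) {
            int d = pix[x] - pix[line_size + x];
            sum  += d < 0 ? -d : d;
        }
        pix += line_size;
    }
    return sum;
}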
+
+;---------------------------------------------------------------------
+;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+; ptrdiff_t line_size, int h);
+;---------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_APPROX 1
+cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
+ mova m1, [pb_80]
+ mova m0, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+ mova m4, [pix1q+lsizeq]
+%if mmsize == 16
+ movu m3, [pix2q]
+ movu m2, [pix2q+lsizeq]
+ psubb m0, m3
+ psubb m4, m2
+%else
+ psubb m0, [pix2q]
+ psubb m4, [pix2q+lsizeq]
+%endif
+ pxor m0, m1
+ pxor m4, m1
+ psadbw m0, m4
+%else ; vsad16_mmxext
+ mova m3, [pix1q+8]
+ psubb m0, [pix2q]
+ psubb m3, [pix2q+8]
+ pxor m0, m1
+ pxor m3, m1
+ mova m4, [pix1q+lsizeq]
+ mova m5, [pix1q+lsizeq+8]
+ psubb m4, [pix2q+lsizeq]
+ psubb m5, [pix2q+lsizeq+8]
+ pxor m4, m1
+ pxor m5, m1
+ psadbw m0, m4
+ psadbw m3, m5
+ paddw m0, m3
+%endif
+ sub hd, 2
+
+.loop:
+ lea pix1q, [pix1q + 2*lsizeq]
+ lea pix2q, [pix2q + 2*lsizeq]
+ mova m2, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+%if mmsize == 16
+ movu m3, [pix2q]
+ psubb m2, m3
+%else
+ psubb m2, [pix2q]
+%endif
+ pxor m2, m1
+ psadbw m4, m2
+ paddw m0, m4
+ mova m4, [pix1q+lsizeq]
+ movu m3, [pix2q+lsizeq]
+ psubb m4, m3
+ pxor m4, m1
+ psadbw m2, m4
+ paddw m0, m2
+%else ; vsad16_mmxext
+ mova m3, [pix1q+8]
+ psubb m2, [pix2q]
+ psubb m3, [pix2q+8]
+ pxor m2, m1
+ pxor m3, m1
+ psadbw m4, m2
+ psadbw m5, m3
+ paddw m0, m4
+ paddw m0, m5
+ mova m4, [pix1q+lsizeq]
+ mova m5, [pix1q+lsizeq+8]
+ psubb m4, [pix2q+lsizeq]
+ psubb m5, [pix2q+lsizeq+8]
+ pxor m4, m1
+ pxor m5, m1
+ psadbw m2, m4
+ psadbw m3, m5
+ paddw m0, m2
+ paddw m0, m3
+%endif
+ sub hd, 2
+ jg .loop
+
+%if mmsize == 16
+ pshufd m1, m0, 0xe
+ paddd m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_APPROX 8
+VSAD_APPROX 16
+INIT_XMM sse2
+VSAD_APPROX 16
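vsad_approx estimates the vertical activity of the residual pix1 - pix2; the asm forms the per-pixel residual with wrapping byte subtraction plus a 0x80 bias before psadbw, hence "approx". A scalar sketch of the quantity it approximates (illustrative helper name):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the metric ff_vsad*_approx estimates. */
static int vsad_approx_ref(const uint8_t *pix1, const uint8_t *pix2,
                           ptrdiff_t line_size, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < w; x++) {
            int d = (pix1[x] - pix2[x]) - (pix1[line_size + x] - pix2[line_size + x]);
            sum  += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}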
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index ee5f559547..dc3e6f8668 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -5,20 +5,20 @@
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,382 +29,67 @@
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"
-#if HAVE_INLINE_ASM
-
-static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
- ptrdiff_t stride, int h)
-{
- int tmp;
-
- __asm__ volatile (
- "movl %4, %%ecx \n"
- "shr $1, %%ecx \n"
- "pxor %%mm0, %%mm0 \n" /* mm0 = 0 */
- "pxor %%mm7, %%mm7 \n" /* mm7 holds the sum */
- "1: \n"
- "movq (%0), %%mm1 \n" /* mm1 = pix1[0][0 - 7] */
- "movq (%1), %%mm2 \n" /* mm2 = pix2[0][0 - 7] */
- "movq (%0, %3), %%mm3 \n" /* mm3 = pix1[1][0 - 7] */
- "movq (%1, %3), %%mm4 \n" /* mm4 = pix2[1][0 - 7] */
-
- /* todo: mm1-mm2, mm3-mm4 */
- /* algo: subtract mm1 from mm2 with saturation and vice versa */
- /* OR the results to get absolute difference */
- "movq %%mm1, %%mm5 \n"
- "movq %%mm3, %%mm6 \n"
- "psubusb %%mm2, %%mm1 \n"
- "psubusb %%mm4, %%mm3 \n"
- "psubusb %%mm5, %%mm2 \n"
- "psubusb %%mm6, %%mm4 \n"
-
- "por %%mm1, %%mm2 \n"
- "por %%mm3, %%mm4 \n"
-
- /* now convert to 16-bit vectors so we can square them */
- "movq %%mm2, %%mm1 \n"
- "movq %%mm4, %%mm3 \n"
-
- "punpckhbw %%mm0, %%mm2 \n"
- "punpckhbw %%mm0, %%mm4 \n"
- "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */
- "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */
-
- "pmaddwd %%mm2, %%mm2 \n"
- "pmaddwd %%mm4, %%mm4 \n"
- "pmaddwd %%mm1, %%mm1 \n"
- "pmaddwd %%mm3, %%mm3 \n"
-
- "lea (%0, %3, 2), %0 \n" /* pix1 += 2 * stride */
- "lea (%1, %3, 2), %1 \n" /* pix2 += 2 * stride */
-
- "paddd %%mm2, %%mm1 \n"
- "paddd %%mm4, %%mm3 \n"
- "paddd %%mm1, %%mm7 \n"
- "paddd %%mm3, %%mm7 \n"
-
- "decl %%ecx \n"
- "jnz 1b \n"
-
- "movq %%mm7, %%mm1 \n"
- "psrlq $32, %%mm7 \n" /* shift hi dword to lo */
- "paddd %%mm7, %%mm1 \n"
- "movd %%mm1, %2 \n"
- : "+r" (pix1), "+r" (pix2), "=r" (tmp)
- : "r" (stride), "m" (h)
- : "%ecx");
-
- return tmp;
-}
-
-static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
- ptrdiff_t stride, int h)
-{
- int tmp;
-
- __asm__ volatile (
- "movl %4, %%ecx\n"
- "pxor %%mm0, %%mm0\n" /* mm0 = 0 */
- "pxor %%mm7, %%mm7\n" /* mm7 holds the sum */
- "1:\n"
- "movq (%0), %%mm1\n" /* mm1 = pix1[0 - 7] */
- "movq (%1), %%mm2\n" /* mm2 = pix2[0 - 7] */
- "movq 8(%0), %%mm3\n" /* mm3 = pix1[8 - 15] */
- "movq 8(%1), %%mm4\n" /* mm4 = pix2[8 - 15] */
-
- /* todo: mm1-mm2, mm3-mm4 */
- /* algo: subtract mm1 from mm2 with saturation and vice versa */
- /* OR the results to get absolute difference */
- "movq %%mm1, %%mm5\n"
- "movq %%mm3, %%mm6\n"
- "psubusb %%mm2, %%mm1\n"
- "psubusb %%mm4, %%mm3\n"
- "psubusb %%mm5, %%mm2\n"
- "psubusb %%mm6, %%mm4\n"
-
- "por %%mm1, %%mm2\n"
- "por %%mm3, %%mm4\n"
-
- /* now convert to 16-bit vectors so we can square them */
- "movq %%mm2, %%mm1\n"
- "movq %%mm4, %%mm3\n"
-
- "punpckhbw %%mm0, %%mm2\n"
- "punpckhbw %%mm0, %%mm4\n"
- "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
- "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
-
- "pmaddwd %%mm2, %%mm2\n"
- "pmaddwd %%mm4, %%mm4\n"
- "pmaddwd %%mm1, %%mm1\n"
- "pmaddwd %%mm3, %%mm3\n"
-
- "add %3, %0\n"
- "add %3, %1\n"
-
- "paddd %%mm2, %%mm1\n"
- "paddd %%mm4, %%mm3\n"
- "paddd %%mm1, %%mm7\n"
- "paddd %%mm3, %%mm7\n"
-
- "decl %%ecx\n"
- "jnz 1b\n"
-
- "movq %%mm7, %%mm1\n"
- "psrlq $32, %%mm7\n" /* shift hi dword to lo */
- "paddd %%mm7, %%mm1\n"
- "movd %%mm1, %2\n"
- : "+r" (pix1), "+r" (pix2), "=r" (tmp)
- : "r" (stride), "m" (h)
- : "%ecx");
-
- return tmp;
-}
-
-static int hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
-{
- int tmp;
-
- __asm__ volatile (
- "movl %3, %%ecx\n"
- "pxor %%mm7, %%mm7\n"
- "pxor %%mm6, %%mm6\n"
-
- "movq (%0), %%mm0\n"
- "movq %%mm0, %%mm1\n"
- "psllq $8, %%mm0\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm0\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq %%mm4, %%mm1\n"
- "psllq $8, %%mm4\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm4\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "1:\n"
-
- "movq (%0), %%mm0\n"
- "movq %%mm0, %%mm1\n"
- "psllq $8, %%mm0\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm0\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
- "psubw %%mm0, %%mm4\n"
- "psubw %%mm2, %%mm5\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm4, %%mm3\n\t"
- "pcmpgtw %%mm5, %%mm1\n\t"
- "pxor %%mm3, %%mm4\n"
- "pxor %%mm1, %%mm5\n"
- "psubw %%mm3, %%mm4\n"
- "psubw %%mm1, %%mm5\n"
- "paddw %%mm4, %%mm5\n"
- "paddw %%mm5, %%mm6\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq %%mm4, %%mm1\n"
- "psllq $8, %%mm4\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm4\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "subl $2, %%ecx\n"
- " jnz 1b\n"
-
- "movq %%mm6, %%mm0\n"
- "punpcklwd %%mm7, %%mm0\n"
- "punpckhwd %%mm7, %%mm6\n"
- "paddd %%mm0, %%mm6\n"
-
- "movq %%mm6, %%mm0\n"
- "psrlq $32, %%mm6\n"
- "paddd %%mm6, %%mm0\n"
- "movd %%mm0, %1\n"
- : "+r" (pix1), "=r" (tmp)
- : "r" (stride), "g" (h - 2)
- : "%ecx");
-
- return tmp;
-}
-
-static int hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
-{
- int tmp;
- uint8_t *pix = pix1;
-
- __asm__ volatile (
- "movl %3, %%ecx\n"
- "pxor %%mm7, %%mm7\n"
- "pxor %%mm6, %%mm6\n"
-
- "movq (%0), %%mm0\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "1:\n"
-
- "movq (%0), %%mm0\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
- "psubw %%mm0, %%mm4\n"
- "psubw %%mm2, %%mm5\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm4, %%mm3\n\t"
- "pcmpgtw %%mm5, %%mm1\n\t"
- "pxor %%mm3, %%mm4\n"
- "pxor %%mm1, %%mm5\n"
- "psubw %%mm3, %%mm4\n"
- "psubw %%mm1, %%mm5\n"
- "paddw %%mm4, %%mm5\n"
- "paddw %%mm5, %%mm6\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "subl $2, %%ecx\n"
- " jnz 1b\n"
-
- "movq %%mm6, %%mm0\n"
- "punpcklwd %%mm7, %%mm0\n"
- "punpckhwd %%mm7, %%mm6\n"
- "paddd %%mm0, %%mm6\n"
+int ff_sum_abs_dctelem_mmx(int16_t *block);
+int ff_sum_abs_dctelem_mmxext(int16_t *block);
+int ff_sum_abs_dctelem_sse2(int16_t *block);
+int ff_sum_abs_dctelem_ssse3(int16_t *block);
+int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
+int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
+int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
- "movq %%mm6, %%mm0\n"
- "psrlq $32, %%mm6\n"
- "paddd %%mm6, %%mm0\n"
- "movd %%mm0, %1\n"
- : "+r" (pix1), "=r" (tmp)
- : "r" (stride), "g" (h - 2)
- : "%ecx");
+#define hadamard_func(cpu) \
+ int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
+ uint8_t *src2, ptrdiff_t stride, int h); \
+ int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
+ uint8_t *src2, ptrdiff_t stride, int h);
- return tmp + hf_noise8_mmx(pix + 8, stride, h);
-}
+hadamard_func(mmx)
+hadamard_func(mmxext)
+hadamard_func(sse2)
+hadamard_func(ssse3)
+#if HAVE_YASM
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
@@ -413,9 +98,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
if (c)
score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
else
- score1 = sse16_mmx(c, pix1, pix2, stride, h);
- score2 = hf_noise16_mmx(pix1, stride, h) -
- hf_noise16_mmx(pix2, stride, h);
+ score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
+ score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
+ - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
if (c)
return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -426,9 +111,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
- int score1 = sse8_mmx(c, pix1, pix2, stride, h);
- int score2 = hf_noise8_mmx(pix1, stride, h) -
- hf_noise8_mmx(pix2, stride, h);
+ int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
+ int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
+ ff_hf_noise8_mmx(pix2, stride, h);
if (c)
return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -436,13 +121,17 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
return score1 + FFABS(score2) * 8;
}
+#endif /* HAVE_YASM */
+
+#if HAVE_INLINE_ASM
+
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
ptrdiff_t stride, int h)
{
int tmp;
- assert((((int) pix) & 7) == 0);
- assert((stride & 7) == 0);
+ av_assert2((((int) pix) & 7) == 0);
+ av_assert2((stride & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
@@ -500,57 +189,14 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
}
#undef SUM
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
- ptrdiff_t stride, int h)
-{
- int tmp;
-
- assert((((int) pix) & 7) == 0);
- assert((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1) \
- "movq (%0), " #out0 "\n" \
- "movq 8(%0), " #out1 "\n" \
- "add %2, %0\n" \
- "psadbw " #out0 ", " #in0 "\n" \
- "psadbw " #out1 ", " #in1 "\n" \
- "paddw " #in1 ", " #in0 "\n" \
- "paddw " #in0 ", %%mm6\n"
-
- __asm__ volatile (
- "movl %3, %%ecx\n"
- "pxor %%mm6, %%mm6\n"
- "pxor %%mm7, %%mm7\n"
- "movq (%0), %%mm0\n"
- "movq 8(%0), %%mm1\n"
- "add %2, %0\n"
- "jmp 2f\n"
- "1:\n"
-
- SUM(%%mm4, %%mm5, %%mm0, %%mm1)
- "2:\n"
- SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
- "subl $2, %%ecx\n"
- "jnz 1b\n"
-
- "movd %%mm6, %1\n"
- : "+r" (pix), "=r" (tmp)
- : "r" (stride), "m" (h)
- : "%ecx");
-
- return tmp;
-}
-#undef SUM
-
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
int tmp;
- assert((((int) pix1) & 7) == 0);
- assert((((int) pix2) & 7) == 0);
- assert((stride & 7) == 0);
+ av_assert2((((int) pix1) & 7) == 0);
+ av_assert2((((int) pix2) & 7) == 0);
+ av_assert2((stride & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
@@ -624,191 +270,16 @@ static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
}
#undef SUM
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
- ptrdiff_t stride, int h)
-{
- int tmp;
-
- assert((((int) pix1) & 7) == 0);
- assert((((int) pix2) & 7) == 0);
- assert((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1) \
- "movq (%0), " #out0 "\n" \
- "movq (%1), %%mm2\n" \
- "movq 8(%0), " #out1 "\n" \
- "movq 8(%1), %%mm3\n" \
- "add %3, %0\n" \
- "add %3, %1\n" \
- "psubb %%mm2, " #out0 "\n" \
- "psubb %%mm3, " #out1 "\n" \
- "pxor %%mm7, " #out0 "\n" \
- "pxor %%mm7, " #out1 "\n" \
- "psadbw " #out0 ", " #in0 "\n" \
- "psadbw " #out1 ", " #in1 "\n" \
- "paddw " #in1 ", " #in0 "\n" \
- "paddw " #in0 ", %%mm6\n "
-
- __asm__ volatile (
- "movl %4, %%ecx\n"
- "pxor %%mm6, %%mm6\n"
- "pcmpeqw %%mm7, %%mm7\n"
- "psllw $15, %%mm7\n"
- "packsswb %%mm7, %%mm7\n"
- "movq (%0), %%mm0\n"
- "movq (%1), %%mm2\n"
- "movq 8(%0), %%mm1\n"
- "movq 8(%1), %%mm3\n"
- "add %3, %0\n"
- "add %3, %1\n"
- "psubb %%mm2, %%mm0\n"
- "psubb %%mm3, %%mm1\n"
- "pxor %%mm7, %%mm0\n"
- "pxor %%mm7, %%mm1\n"
- "jmp 2f\n"
- "1:\n"
-
- SUM(%%mm4, %%mm5, %%mm0, %%mm1)
- "2:\n"
- SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
- "subl $2, %%ecx\n"
- "jnz 1b\n"
-
- "movd %%mm6, %2\n"
- : "+r" (pix1), "+r" (pix2), "=r" (tmp)
- : "r" (stride), "m" (h)
- : "%ecx");
-
- return tmp;
-}
-#undef SUM
-
-#define MMABS_MMX(a,z) \
- "pxor " #z ", " #z " \n\t" \
- "pcmpgtw " #a ", " #z " \n\t" \
- "pxor " #z ", " #a " \n\t" \
- "psubw " #z ", " #a " \n\t"
-
-#define MMABS_MMXEXT(a, z) \
- "pxor " #z ", " #z " \n\t" \
- "psubw " #a ", " #z " \n\t" \
- "pmaxsw " #z ", " #a " \n\t"
-
-#define MMABS_SSSE3(a,z) \
- "pabsw " #a ", " #a " \n\t"
-
-#define MMABS_SUM(a,z, sum) \
- MMABS(a,z) \
- "paddusw " #a ", " #sum " \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
- * natural video, and it's even more unlikely to not have any alternative
- * mvs/modes with lower cost. */
-#define HSUM_MMX(a, t, dst) \
- "movq " #a ", " #t " \n\t" \
- "psrlq $32, " #a " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movq " #a ", " #t " \n\t" \
- "psrlq $16, " #a " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define HSUM_MMXEXT(a, t, dst) \
- "pshufw $0x0E, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshufw $0x01, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define HSUM_SSE2(a, t, dst) \
- "movhlps " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshuflw $0x0E, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshuflw $0x01, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define DCT_SAD4(m, mm, o) \
- "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
- "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
- "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
- "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
- MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
- MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
- MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
- MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \
-
-#define DCT_SAD_MMX \
- "pxor %%mm0, %%mm0 \n\t" \
- "pxor %%mm1, %%mm1 \n\t" \
- DCT_SAD4(q, %%mm, 0) \
- DCT_SAD4(q, %%mm, 8) \
- DCT_SAD4(q, %%mm, 64) \
- DCT_SAD4(q, %%mm, 72) \
- "paddusw %%mm1, %%mm0 \n\t" \
- HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2 \
- "pxor %%xmm0, %%xmm0 \n\t" \
- "pxor %%xmm1, %%xmm1 \n\t" \
- DCT_SAD4(dqa, %%xmm, 0) \
- DCT_SAD4(dqa, %%xmm, 64) \
- "paddusw %%xmm1, %%xmm0 \n\t" \
- HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu) \
-static int sum_abs_dctelem_ ## cpu(int16_t *block) \
-{ \
- int sum; \
- __asm__ volatile ( \
- DCT_SAD \
- :"=r"(sum) \
- :"r"(block)); \
- return sum & 0xFFFF; \
-}
-
-#define DCT_SAD DCT_SAD_MMX
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
-#define MMABS(a, z) MMABS_MMX(a, z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
-#define MMABS(a, z) MMABS_MMXEXT(a, z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD DCT_SAD_SSE2
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a, z) MMABS_SSSE3(a, z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
-
-
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
-
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
- x86_reg len = -(stride * h);
+ x86_reg len = -stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
@@ -841,133 +312,10 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
: "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
- ptrdiff_t stride, int h)
-{
- __asm__ volatile (
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" (stride));
-}
-
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
- ptrdiff_t stride, int h)
-{
- int ret;
- __asm__ volatile (
- "pxor %%xmm2, %%xmm2 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movdqu (%1), %%xmm0 \n\t"
- "movdqu (%1, %4), %%xmm1 \n\t"
- "psadbw (%2), %%xmm0 \n\t"
- "psadbw (%2, %4), %%xmm1 \n\t"
- "paddw %%xmm0, %%xmm2 \n\t"
- "paddw %%xmm1, %%xmm2 \n\t"
- "lea (%1,%4,2), %1 \n\t"
- "lea (%2,%4,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- "movhlps %%xmm2, %%xmm0 \n\t"
- "paddw %%xmm0, %%xmm2 \n\t"
- "movd %%xmm2, %3 \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
- : "r" (stride));
- return ret;
-}
-
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
- ptrdiff_t stride, int h)
-{
- __asm__ volatile (
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "pavgb 1(%1), %%mm0 \n\t"
- "pavgb 1(%1, %3), %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" (stride));
-}
-
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
- ptrdiff_t stride, int h)
-{
- __asm__ volatile (
- "movq (%1), %%mm0 \n\t"
- "add %3, %1 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "pavgb %%mm1, %%mm0 \n\t"
- "pavgb %%mm2, %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" (stride));
-}
-
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
- ptrdiff_t stride, int h)
-{
- __asm__ volatile (
- "movq "MANGLE(bone)", %%mm5 \n\t"
- "movq (%1), %%mm0 \n\t"
- "pavgb 1(%1), %%mm0 \n\t"
- "add %3, %1 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1,%3), %%mm2 \n\t"
- "pavgb 1(%1), %%mm1 \n\t"
- "pavgb 1(%1,%3), %%mm2 \n\t"
- "psubusb %%mm5, %%mm1 \n\t"
- "pavgb %%mm1, %%mm0 \n\t"
- "pavgb %%mm2, %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2,%3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" (stride));
-}
-
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
ptrdiff_t stride, int h)
{
- x86_reg len = -(stride * h);
+ x86_reg len = -stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
@@ -1006,7 +354,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
- x86_reg len = -(stride * h);
+ x86_reg len = -stride * h;
__asm__ volatile (
"movq (%1, %%"FF_REG_a"), %%mm0\n\t"
"movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
@@ -1030,7 +378,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
"punpckhbw %%mm7, %%mm5 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"paddw %%mm5, %%mm3 \n\t"
- "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
+ "movq %5, %%mm5 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
"paddw %%mm5, %%mm0 \n\t"
@@ -1054,7 +402,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
- "r" (stride));
+ "r" (stride), "m" (round_tab[2]));
}
static inline int sum_mmx(void)
@@ -1072,15 +420,6 @@ static inline int sum_mmx(void)
return ret & 0xFFFF;
}
-static inline int sum_mmxext(void)
-{
- int ret;
- __asm__ volatile (
- "movd %%mm6, %0 \n\t"
- : "=r" (ret));
- return ret;
-}
-
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
@@ -1097,7 +436,7 @@ static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -1111,7 +450,7 @@ static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -1126,7 +465,7 @@ static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -1141,7 +480,7 @@ static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -1211,32 +550,15 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
} \
PIX_SAD(mmx)
-PIX_SAD(mmxext)
#endif /* HAVE_INLINE_ASM */
-int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
- ptrdiff_t stride, int h);
-
-#define hadamard_func(cpu) \
- int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
- uint8_t *src2, ptrdiff_t stride, int h); \
- int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
- uint8_t *src2, ptrdiff_t stride, int h);
-
-hadamard_func(mmx)
-hadamard_func(mmxext)
-hadamard_func(sse2)
-hadamard_func(ssse3)
-
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
- c->sum_abs_dctelem = sum_abs_dctelem_mmx;
-
c->pix_abs[0][0] = sad16_mmx;
c->pix_abs[0][1] = sad16_x2_mmx;
c->pix_abs[0][2] = sad16_y2_mmx;
@@ -1249,77 +571,81 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = sad16_mmx;
c->sad[1] = sad8_mmx;
- c->sse[0] = sse16_mmx;
- c->sse[1] = sse8_mmx;
c->vsad[4] = vsad_intra16_mmx;
- c->nsse[0] = nsse16_mmx;
- c->nsse[1] = nsse8_mmx;
-
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->vsad[0] = vsad16_mmx;
}
}
- if (INLINE_MMXEXT(cpu_flags)) {
- c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
-
- c->vsad[4] = vsad_intra16_mmxext;
-
- c->pix_abs[0][0] = sad16_mmxext;
- c->pix_abs[1][0] = sad8_mmxext;
-
- c->sad[0] = sad16_mmxext;
- c->sad[1] = sad8_mmxext;
-
- if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
- c->pix_abs[0][1] = sad16_x2_mmxext;
- c->pix_abs[0][2] = sad16_y2_mmxext;
- c->pix_abs[0][3] = sad16_xy2_mmxext;
- c->pix_abs[1][1] = sad8_x2_mmxext;
- c->pix_abs[1][2] = sad8_y2_mmxext;
- c->pix_abs[1][3] = sad8_xy2_mmxext;
-
- c->vsad[0] = vsad16_mmxext;
- }
- }
-
- if (INLINE_SSE2(cpu_flags)) {
- c->sum_abs_dctelem = sum_abs_dctelem_sse2;
- }
-
- if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
- c->sad[0] = sad16_sse2;
- }
-
-#if HAVE_SSSE3_INLINE
- if (INLINE_SSSE3(cpu_flags)) {
- c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
- }
-#endif
#endif /* HAVE_INLINE_ASM */
if (EXTERNAL_MMX(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
+ c->sse[0] = ff_sse16_mmx;
+ c->sse[1] = ff_sse8_mmx;
+#if HAVE_YASM
+ c->nsse[0] = nsse16_mmx;
+ c->nsse[1] = nsse8_mmx;
+#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
+
+ c->sad[0] = ff_sad16_mmxext;
+ c->sad[1] = ff_sad8_mmxext;
+
+ c->pix_abs[0][0] = ff_sad16_mmxext;
+ c->pix_abs[0][1] = ff_sad16_x2_mmxext;
+ c->pix_abs[0][2] = ff_sad16_y2_mmxext;
+ c->pix_abs[1][0] = ff_sad8_mmxext;
+ c->pix_abs[1][1] = ff_sad8_x2_mmxext;
+ c->pix_abs[1][2] = ff_sad8_y2_mmxext;
+
+ c->vsad[4] = ff_vsad_intra16_mmxext;
+ c->vsad[5] = ff_vsad_intra8_mmxext;
+
+ if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+ c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
+ c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
+
+ c->vsad[0] = ff_vsad16_approx_mmxext;
+ c->vsad[1] = ff_vsad8_approx_mmxext;
+ }
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->sse[0] = ff_sse16_sse2;
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
+ if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
+ c->sad[0] = ff_sad16_sse2;
+ c->pix_abs[0][0] = ff_sad16_sse2;
+ c->pix_abs[0][1] = ff_sad16_x2_sse2;
+ c->pix_abs[0][2] = ff_sad16_y2_sse2;
+
+ c->vsad[4] = ff_vsad_intra16_sse2;
+ if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+ c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
+ c->vsad[0] = ff_vsad16_approx_sse2;
+ }
+ }
}
- if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
+#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+#endif
}
}
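
For reference, the sad/pix_abs pointers installed above all compute the same quantity: the sum of absolute byte differences over an 8- or 16-pixel-wide block of h rows. A minimal scalar sketch of the 16-wide case follows; the context argument is kept as an opaque void * so the snippet stays self-contained, whereas the real prototypes take an MpegEncContext * that the SIMD versions likewise ignore.

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of a sad16-style kernel: sum of |blk1[x] - blk2[x]| over a
 * 16-pixel-wide, h-row block. */
static int sad16_c_sketch(void *ctx, const uint8_t *blk2, const uint8_t *blk1,
                          ptrdiff_t stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            sum += abs(blk1[x] - blk2[x]);
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}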
diff --git a/libavcodec/x86/mlpdsp.asm b/libavcodec/x86/mlpdsp.asm
new file mode 100644
index 0000000000..3dc641e89e
--- /dev/null
+++ b/libavcodec/x86/mlpdsp.asm
@@ -0,0 +1,196 @@
+;******************************************************************************
+;* SIMD-optimized MLP DSP functions
+;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%if ARCH_X86_64
+
+%macro SHLX 2
+%if cpuflag(bmi2)
+ shlx %1, %1, %2q
+%else
+ shl %1, %2b
+%endif
+%endmacro
+
+%macro REMATRIX 0
+ movdqa m0, [samplesq]
+ movdqa m1, [coeffsq ]
+ pshufd m2, m0, q2301
+ pshufd m3, m1, q2301
+ pmuldq m0, m1
+ pmuldq m3, m2
+ paddq m0, m3
+%if notcpuflag(avx2)
+ movdqa m1, [samplesq + 16]
+ movdqa m2, [coeffsq + 16]
+ pshufd m3, m1, q2301
+ pshufd m4, m2, q2301
+ pmuldq m1, m2
+ pmuldq m4, m3
+ paddq m0, m1
+ paddq m0, m4
+%else
+ vextracti128 xm1, m0, 1
+ paddq xm0, xm1
+%endif
+%endmacro
+
+%macro LOOP_END 0
+ pshufd xm1, xm0, q0032
+ paddq xm0, xm1
+ movq accumq, xm0
+ movzx blsbsd, byte [blsbs_ptrq] ; load *bypassed_lsbs
+ sar accumq, 14 ; accum >>= 14
+ and accumd, maskd ; accum &= mask
+ add accumd, blsbsd ; accum += *bypassed_lsbs
+ mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
+ add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;
+ add samplesq, 32 ; samples += MAX_CHANNELS;
+ cmp blsbs_ptrq, cntq
+%endmacro
+
+%macro LOOP_SHIFT_END 0
+ pshufd xm1, xm0, q0032
+ paddq xm0, xm1
+ movq accumq, xm0
+ and indexd, auspd ; index &= access_unit_size_pow2;
+ movsx noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
+ add indexd, index2d ; index += index2
+ SHLX noiseq, mns ; noise_buffer[index] <<= matrix_noise_shift
+ add accumq, noiseq ; accum += noise_buffer[index]
+ movzx noised, byte [blsbs_ptrq] ; load *bypassed_lsbs (reuse tmp noise register)
+ sar accumq, 14 ; accum >>= 14
+ and accumd, maskd ; accum &= mask
+ add accumd, noised ; accum += *bypassed_lsbs
+ mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
+ add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;
+ add samplesq, 32 ; samples += MAX_CHANNELS;
+ cmp blsbs_ptrq, cntq
+%endmacro
+
+;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
+; const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
+; int index, unsigned int dest_ch, uint16_t blockpos,
+; unsigned int maxchan, int matrix_noise_shift,
+; int access_unit_size_pow2, int32_t mask)
+%macro MLP_REMATRIX_CHANNEL 0
+cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
+ index, dest_ch, blockpos, maxchan, mns, \
+ accum, mask, cnt
+ mov mnsd, mnsm ; load matrix_noise_shift
+ movzx blockposq, word blockposm ; load and zero extend blockpos (16bit)
+ mov maxchand, maxchanm ; load maxchan
+ mov maskd, maskm ; load mask
+%if WIN64
+ mov dest_chd, dest_chm ; load dest_chd (not needed on UNIX64)
+%endif
+ shl dest_chd, 2
+ lea cntq, [blsbs_ptrq + blockposq*8]
+ test mnsd, mnsd ; is matrix_noise_shift != 0?
+ jne .shift ; jump if true
+ cmp maxchand, 4 ; is maxchan < 4?
+ jl .loop4 ; jump if true
+
+align 16
+.loop8:
+ ; Process 5 or more channels
+ REMATRIX
+ LOOP_END
+ jne .loop8
+ RET
+
+align 16
+.loop4:
+ ; Process up to 4 channels
+ movdqa xm0, [samplesq]
+ movdqa xm1, [coeffsq ]
+ pshufd xm2, xm0, q2301
+ pshufd xm3, xm1, q2301
+ pmuldq xm0, xm1
+ pmuldq xm3, xm2
+ paddq xm0, xm3
+ LOOP_END
+ jne .loop4
+ RET
+
+.shift:
+%if WIN64
+ mov indexd, indexm ; load index (not needed on UNIX64)
+%endif
+ mov r9d, r9m ; load access_unit_size_pow2
+%if cpuflag(bmi2)
+ ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
+ DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
+ index, dest_ch, accum, index2, mns, \
+ ausp, mask, cnt, noise
+ add mnsd, 7 ; matrix_noise_shift += 7
+%else ; sse4
+ mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift
+%if WIN64
+ ; r0 = rcx
+ DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
+ index2, accum, ausp, mask, cnt, noise
+%else ; UNIX64
+ ; r3 = rcx
+ DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
+ index2, accum, ausp, mask, cnt, noise
+%endif
+ lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7
+%endif ; cpuflag
+ sub auspd, 1 ; access_unit_size_pow2 -= 1
+ cmp r7d, 4 ; is maxchan < 4?
+ lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
+ jl .loop4_shift ; jump if maxchan < 4
+
+align 16
+.loop8_shift:
+ ; Process 5 or more channels
+ REMATRIX
+ LOOP_SHIFT_END
+ jne .loop8_shift
+ RET
+
+align 16
+.loop4_shift:
+ ; Process up to 4 channels
+ movdqa xm0, [samplesq]
+ movdqa xm1, [coeffsq ]
+ pshufd xm2, xm0, q2301
+ pshufd xm3, xm1, q2301
+ pmuldq xm0, xm1
+ pmuldq xm3, xm2
+ paddq xm0, xm3
+ LOOP_SHIFT_END
+ jne .loop4_shift
+ RET
+%endmacro
+
+INIT_XMM sse4
+MLP_REMATRIX_CHANNEL
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2, bmi2
+MLP_REMATRIX_CHANNEL
+%endif
+
+%endif ; ARCH_X86_64
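
The per-sample arithmetic is spelled out in the LOOP_END/LOOP_SHIFT_END comments above. The C sketch below is reconstructed from those comments only (it is not the canonical code in libavcodec/mlpdsp.c): MAX_CHANNELS is assumed to be 8 to match the +8/+32 pointer strides, bypassed_lsbs is assumed to already point at the destination channel's column, and the scalar loop stops at maxchan where the SIMD code processes 4 or 8 lanes at a time (presumably with zero coefficients in the unused lanes).

#include <stdint.h>

#define MAX_CHANNELS 8   /* assumption: matches the +8 / +32 strides above */

static void mlp_rematrix_channel_sketch(int32_t *samples, const int32_t *coeffs,
                                        const uint8_t *bypassed_lsbs,
                                        const int8_t *noise_buffer,
                                        int index, unsigned int dest_ch,
                                        uint16_t blockpos, unsigned int maxchan,
                                        int matrix_noise_shift,
                                        int access_unit_size_pow2, int32_t mask)
{
    int index2 = 2 * index + 1;                     /* lea index2q, [indexq*2 + 1] */

    for (unsigned int i = 0; i < blockpos; i++) {
        int64_t accum = 0;
        for (unsigned int ch = 0; ch <= maxchan; ch++)
            accum += (int64_t)samples[ch] * coeffs[ch];

        if (matrix_noise_shift) {
            index &= access_unit_size_pow2 - 1;     /* sub auspd, 1 / and indexd, auspd */
            accum += (int64_t)noise_buffer[index] << (matrix_noise_shift + 7);
            index += index2;
        }

        accum >>= 14;                               /* sar accumq, 14 */
        samples[dest_ch] = ((int32_t)accum & mask) + *bypassed_lsbs;

        bypassed_lsbs += MAX_CHANNELS;              /* add blsbs_ptrq, 8  */
        samples       += MAX_CHANNELS;              /* add samplesq, 32   */
    }
}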
diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp_init.c
index 72fc637764..7f5e6b11d5 100644
--- a/libavcodec/x86/mlpdsp.c
+++ b/libavcodec/x86/mlpdsp_init.c
@@ -2,32 +2,47 @@
* MLP DSP functions x86-optimized
* Copyright (c) 2009 Ramiro Polla
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mlpdsp.h"
#include "libavcodec/mlp.h"
-#if HAVE_7REGS && HAVE_INLINE_ASM
+#define REMATRIX_CHANNEL_FUNC(opt) \
+void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
+ const int32_t *coeffs, \
+ const uint8_t *bypassed_lsbs, \
+ const int8_t *noise_buffer, \
+ int index, \
+ unsigned int dest_ch, \
+ uint16_t blockpos, \
+ unsigned int maxchan, \
+ int matrix_noise_shift, \
+ int access_unit_size_pow2, \
+ int32_t mask);
+
+REMATRIX_CHANNEL_FUNC(sse4)
+REMATRIX_CHANNEL_FUNC(avx2_bmi2)
+
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
extern char ff_mlp_firorder_8;
extern char ff_mlp_firorder_7;
@@ -45,12 +60,12 @@ extern char ff_mlp_iirorder_2;
extern char ff_mlp_iirorder_1;
extern char ff_mlp_iirorder_0;
-static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
+static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
&ff_mlp_firorder_2, &ff_mlp_firorder_3,
&ff_mlp_firorder_4, &ff_mlp_firorder_5,
&ff_mlp_firorder_6, &ff_mlp_firorder_7,
&ff_mlp_firorder_8 };
-static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
+static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
&ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
&ff_mlp_iirorder_4 };
@@ -133,8 +148,8 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
FIRMUL (ff_mlp_firorder_6, 0x14 )
FIRMUL (ff_mlp_firorder_5, 0x10 )
FIRMUL (ff_mlp_firorder_4, 0x0c )
- FIRMULREG(ff_mlp_firorder_3, 0x08,10)
- FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
+ FIRMUL (ff_mlp_firorder_3, 0x08 )
+ FIRMUL (ff_mlp_firorder_2, 0x04 )
FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
"jmp *%6 \n\t"
@@ -163,8 +178,6 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
: /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
/* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
, /* 8*/"r"((int64_t)coeff[0])
- , /* 9*/"r"((int64_t)coeff[1])
- , /*10*/"r"((int64_t)coeff[2])
: "rax", "rdx", "rsi"
#else /* ARCH_X86_32 */
/* 3*/"+m"(blocksize)
@@ -179,9 +192,13 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
{
-#if HAVE_7REGS && HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
if (INLINE_MMX(cpu_flags))
c->mlp_filter_channel = mlp_filter_channel_x86;
#endif
+ if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
+ if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
}
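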
diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c
index 591f5270bd..9499141eea 100644
--- a/libavcodec/x86/mpegaudiodsp.c
+++ b/libavcodec/x86/mpegaudiodsp.c
@@ -2,20 +2,20 @@
* SIMD-optimized MP3 decoding functions
* Copyright (c) 2010 Vitor Sessak
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,11 +26,20 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
-void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
+#define DECL(CPU)\
+static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
+void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
+
+#if HAVE_YASM
+#if ARCH_X86_32
+DECL(sse)
+#endif
+DECL(sse2)
+DECL(sse3)
+DECL(ssse3)
+DECL(avx)
+#endif /* HAVE_YASM */
+
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
@@ -38,7 +47,7 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
-#if HAVE_SSE2_INLINE
+#if HAVE_6REGS && HAVE_SSE_INLINE
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -182,7 +191,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
*out = sum;
}
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
#if HAVE_YASM
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
@@ -217,16 +226,22 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
} \
}
+#if HAVE_SSE
+#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
+#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
+#endif
+#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
+#endif
#endif /* HAVE_YASM */
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
- int cpu_flags = av_get_cpu_flags();
+ av_unused int cpu_flags = av_get_cpu_flags();
int i, j;
for (j = 0; j < 4; j++) {
@@ -242,16 +257,19 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
}
}
-#if HAVE_SSE2_INLINE
- if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS && HAVE_SSE_INLINE
+ if (INLINE_SSE(cpu_flags)) {
s->apply_window_float = apply_window_mp3;
}
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
#if HAVE_YASM
+#if HAVE_SSE
+#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse;
}
+#endif
if (EXTERNAL_SSE2(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse2;
}
@@ -261,8 +279,11 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
if (EXTERNAL_SSSE3(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_ssse3;
}
+#endif
+#if HAVE_AVX_EXTERNAL
if (EXTERNAL_AVX(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_avx;
}
+#endif
#endif /* HAVE_YASM */
}
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 6c0493e6b8..35a8264804 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -2,20 +2,20 @@
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
* H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,8 +25,9 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
+#include "libavcodec/mpegvideodata.h"
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
@@ -35,7 +36,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
qmul = qscale << 1;
- assert(s->block_last_index[n]>=0 || s->h263_aic);
+ av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
if (!s->h263_aic) {
if (n < 4)
@@ -111,7 +112,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
qmul = qscale << 1;
qadd = (qscale - 1) | 1;
- assert(s->block_last_index[n]>=0 || s->h263_aic);
+ av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
@@ -171,7 +172,7 @@ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
const uint16_t *quant_matrix;
int block0;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
@@ -190,9 +191,9 @@ __asm__ volatile(
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0\n\t"
+ "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
@@ -208,7 +209,7 @@ __asm__ volatile(
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
@@ -222,7 +223,7 @@ __asm__ volatile(
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -239,7 +240,7 @@ static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
x86_reg nCoeffs;
const uint16_t *quant_matrix;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
@@ -253,9 +254,9 @@ __asm__ volatile(
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0\n\t"
+ "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
@@ -275,7 +276,7 @@ __asm__ volatile(
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
@@ -289,7 +290,7 @@ __asm__ volatile(
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -306,7 +307,10 @@ static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
const uint16_t *quant_matrix;
int block0;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
+
+ if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+ else qscale <<= 1;
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -325,9 +329,9 @@ __asm__ volatile(
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0\n\t"
+ "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
@@ -343,17 +347,17 @@ __asm__ volatile(
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
- "psraw $3, %%mm0 \n\t"
- "psraw $3, %%mm1 \n\t"
+ "psraw $4, %%mm0 \n\t"
+ "psraw $4, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -371,7 +375,10 @@ static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
x86_reg nCoeffs;
const uint16_t *quant_matrix;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
+
+ if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+ else qscale <<= 1;
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -386,9 +393,9 @@ __asm__ volatile(
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0\n\t"
+ "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
@@ -408,10 +415,10 @@ __asm__ volatile(
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
- "psrlw $4, %%mm0 \n\t"
- "psrlw $4, %%mm1 \n\t"
+ "psrlw $5, %%mm0 \n\t"
+ "psrlw $5, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
@@ -420,7 +427,7 @@ __asm__ volatile(
"pandn %%mm1, %%mm5 \n\t"
"pxor %%mm4, %%mm7 \n\t"
"pxor %%mm5, %%mm7 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -442,11 +449,11 @@ __asm__ volatile(
);
}
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
{
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
@@ -458,5 +465,5 @@ av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
}
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
}
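
Because qscale is now pre-scaled (mapped through ff_mpeg2_non_linear_qscale[] or doubled) before being splatted into mm6, the arithmetic right shifts in the MPEG-2 paths grow by one bit. A hedged scalar sketch of the intra dequant the asm mirrors, reconstructed from its own comments ("q=qscale*quant_matrix[i]", "abs(block[i])*q", psraw $4):

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of dct_unquantize_mpeg2_intra as implemented above; qscale is
 * assumed to be the pre-scaled value and the DC coefficient (block0) is
 * handled separately, as in the asm. */
static void unquantize_mpeg2_intra_sketch(int16_t *block, int nCoeffs,
                                          const uint16_t *quant_matrix,
                                          int qscale, int block0)
{
    block[0] = block0;
    for (int i = 1; i <= nCoeffs; i++) {
        int level = block[i];
        if (level) {
            int negative = level < 0;
            level = (abs(level) * qscale * quant_matrix[i]) >> 4;   /* psraw $4 */
            block[i] = negative ? -level : level;
        }
    }
}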
diff --git a/libavcodec/x86/mpegvideodsp.c b/libavcodec/x86/mpegvideodsp.c
index b701ef8cc7..e0498f3849 100644
--- a/libavcodec/x86/mpegvideodsp.c
+++ b/libavcodec/x86/mpegvideodsp.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -21,6 +21,7 @@
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegvideodsp.h"
+#include "libavcodec/videodsp.h"
#if HAVE_INLINE_ASM
@@ -42,20 +43,24 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
const uint64_t shift2 = 2 * shift;
+#define MAX_STRIDE 4096U
+#define MAX_H 8U
+ uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
int x, y;
const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
const int dxh = dxy * (h - 1);
const int dyw = dyx * (w - 1);
+ int need_emu = (unsigned) ix >= width - w ||
+ (unsigned) iy >= height - h;
if ( // non-constant fullpel offset (3% of blocks)
((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
(oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
// uses more than 16 bits of subpel mv (only at huge resolution)
(dxx | dxy | dyx | dyy) & 15 ||
- (unsigned) ix >= width - w ||
- (unsigned) iy >= height - h) {
+ (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
// FIXME could still use mmx for some of the rows
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
shift, r, width, height);
@@ -63,6 +68,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
}
src += ix + iy * stride;
+ if (need_emu) {
+ ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
+ src = edge_buf;
+ }
__asm__ volatile (
"movd %0, %%mm6 \n\t"
@@ -149,4 +158,3 @@ av_cold void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c)
c->gmc = gmc_mmx;
#endif /* HAVE_INLINE_ASM */
}
-
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 47349d17ec..67b26178a8 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -2,20 +2,20 @@
* The simplest mpeg encoder (well, it was the simplest!)
* Copyright (c) 2000,2001 Fabrice Bellard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -30,6 +30,8 @@
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
+#if HAVE_6REGS
+
#if HAVE_MMX_INLINE
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
@@ -81,7 +83,10 @@ DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */
+#endif /* HAVE_6REGS */
+
#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
@@ -135,7 +140,9 @@ static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
: "r"(block+64)
);
}
+#endif /* HAVE_MMX_INLINE */
+#if HAVE_SSE2_INLINE
static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
@@ -191,9 +198,10 @@ static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
+#endif /* HAVE_SSE2_INLINE */
#endif /* HAVE_INLINE_ASM */
-av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
+av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
{
const int dct_algo = s->avctx->dct_algo;
int i;
@@ -205,21 +213,25 @@ av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
+#if HAVE_6REGS
s->dct_quantize = dct_quantize_mmx;
+#endif
s->denoise_dct = denoise_dct_mmx;
}
#endif
-#if HAVE_MMXEXT_INLINE
+#if HAVE_6REGS && HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
s->dct_quantize = dct_quantize_mmxext;
#endif
#if HAVE_SSE2_INLINE
if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
+#endif
s->denoise_dct = denoise_dct_sse2;
}
#endif
-#if HAVE_SSSE3_INLINE
+#if HAVE_6REGS && HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
s->dct_quantize = dct_quantize_ssse3;
#endif
diff --git a/libavcodec/x86/mpegvideoenc_qns_template.c b/libavcodec/x86/mpegvideoenc_qns_template.c
index 8d8d68762a..882d486205 100644
--- a/libavcodec/x86/mpegvideoenc_qns_template.c
+++ b/libavcodec/x86/mpegvideoenc_qns_template.c
@@ -5,26 +5,26 @@
* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include <assert.h>
#include <stdint.h>
+#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
@@ -36,7 +36,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[
{
x86_reg i=0;
- assert(FFABS(scale) < MAX_ABS);
+ av_assert2(FFABS(scale) < MAX_ABS);
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
SET_RND(mm6);
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index 72df76b749..b2512744ca 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -3,20 +3,20 @@
*
* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -108,7 +108,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
const uint16_t *qmat, *bias;
LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
- assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
+ av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
//s->fdct (block);
RENAME_FDCT(ff_fdct)(block); // cannot be anything else ...
@@ -118,10 +118,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
if (s->mb_intra) {
int dummy;
- if (n < 4)
+ if (n < 4){
q = s->y_dc_scale;
- else
+ bias = s->q_intra_matrix16[qscale][1];
+ qmat = s->q_intra_matrix16[qscale][0];
+ }else{
q = s->c_dc_scale;
+ bias = s->q_chroma_intra_matrix16[qscale][1];
+ qmat = s->q_chroma_intra_matrix16[qscale][0];
+ }
/* note: block[0] is assumed to be positive */
if (!s->h263_aic) {
__asm__ volatile (
@@ -136,8 +141,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
block[0]=0; //avoid fake overflow
// temp_block[0] = (block[0] + (q >> 1)) / q;
last_non_zero_p1 = 1;
- bias = s->q_intra_matrix16[qscale][1];
- qmat = s->q_intra_matrix16[qscale][0];
} else {
last_non_zero_p1 = 0;
bias = s->q_inter_matrix16[qscale][1];
@@ -173,7 +176,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"FF_REG_a" \n\t"
- "movzb %%al, %%"FF_REG_a" \n\t" // last_non_zero_p1
+ "movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -207,7 +210,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"FF_REG_a" \n\t"
- "movzb %%al, %%"FF_REG_a" \n\t" // last_non_zero_p1
+ "movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -221,7 +224,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
"psubusw "MM"1, "MM"4 \n\t"
"packuswb "MM"4, "MM"4 \n\t"
#if COMPILE_TEMPLATE_SSE2
- "packuswb "MM"4, "MM"4 \n\t"
+ "packsswb "MM"4, "MM"4 \n\t"
#endif
"movd "MM"4, %0 \n\t" // *overflow
: "=g" (*overflow)
@@ -275,6 +278,50 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+ }else if(s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2){
+ if(last_non_zero_p1 <= 1) goto end;
+ block[0x04] = temp_block[0x01];
+ block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+ if(last_non_zero_p1 <= 4) goto end;
+ block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
+ block[0x05] = temp_block[0x03];
+ if(last_non_zero_p1 <= 7) goto end;
+ block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
+ block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+ if(last_non_zero_p1 <= 11) goto end;
+ block[0x1C] = temp_block[0x19];
+ block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
+ block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
+ if(last_non_zero_p1 <= 16) goto end;
+ block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
+ block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
+ block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+ block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
+ if(last_non_zero_p1 <= 24) goto end;
+ block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
+ block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
+ block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
+ block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
+ if(last_non_zero_p1 <= 32) goto end;
+ block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
+ block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+ block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
+ block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
+ if(last_non_zero_p1 <= 40) goto end;
+ block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
+ block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+ block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
+ block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
+ if(last_non_zero_p1 <= 48) goto end;
+ block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
+ block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
+ block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+ block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
+ if(last_non_zero_p1 <= 56) goto end;
+ block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
+ block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
+ block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+ block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else{
if(last_non_zero_p1 <= 1) goto end;
block[0x01] = temp_block[0x01];
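
Before those permutation tables scatter the results back into block[] in the IDCT's coefficient order, the template quantizes temp_block[] in place. A hedged scalar sketch of that step (not the exact template code): 16-bit fixed point with an implied QMAT_SHIFT_MMX of 16, the bias added before the multiply-high, and last_non_zero_p1 tracked through inv_zigzag_direct16 so the copy loops above can bail out early.

#include <stdint.h>
#include <stdlib.h>

/* Sketch of the quantize loop: qmat/bias are the 16-bit tables selected in the
 * hunk above (q_intra_matrix16 / q_chroma_intra_matrix16 / q_inter_matrix16). */
static int dct_quantize_sketch(const int16_t *block, int16_t *temp_block,
                               const uint16_t *qmat, const uint16_t *bias,
                               const uint16_t *inv_zigzag, int start)
{
    int last_non_zero_p1 = start;
    for (int i = start; i < 64; i++) {
        int level = (abs(block[i]) + bias[i]) * qmat[i] >> 16;
        if (level) {
            if (block[i] < 0)
                level = -level;
            if (inv_zigzag[i] > last_non_zero_p1)
                last_non_zero_p1 = inv_zigzag[i];
        }
        temp_block[i] = level;
    }
    return last_non_zero_p1;
}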
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 9326ee776d..aec73f82dc 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -4,92 +4,151 @@
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION .text
+SECTION_RODATA
-INIT_MMX mmx
+cextern pw_1
+
+SECTION .text
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of loops
+; %2 = number of GPRs used
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
movsxdifnidn r1, r1d
- mov r2, r1
- neg r2
- shl r2, 4
- sub r0, r2
- pxor m7, m7
- pxor m6, m6
+ mov r2, %1
+%if mmsize == 16
+ lea r3, [r1*3]
+%endif
+%if notcpuflag(xop)
+ pxor m5, m5
+%endif
+ pxor m4, m4
.loop:
- mova m0, [r0+r2+0]
- mova m1, [r0+r2+0]
- mova m2, [r0+r2+8]
- mova m3, [r0+r2+8]
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
+%if cpuflag(xop)
+ vphaddubq m0, [r0]
+ vphaddubq m1, [r0+r1]
+ vphaddubq m2, [r0+r1*2]
+ vphaddubq m3, [r0+r3]
+%else
+ mova m0, [r0]
+%if mmsize == 8
+ mova m1, [r0+8]
+%if cpuflag(mmxext)
+ mova m2, [r0+r1]
+ mova m3, [r0+r1+8]
+%endif
+%else ; sse2
+ mova m1, [r0+r1]
+ mova m2, [r0+r1*2]
+ mova m3, [r0+r3]
+%endif
+%if cpuflag(mmxext)
+ psadbw m0, m5
+ psadbw m1, m5
+ psadbw m2, m5
+ psadbw m3, m5
+%else ; mmx
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif ; cpuflag(mmxext)
+%endif ; cpuflag(xop)
paddw m1, m0
paddw m3, m2
paddw m3, m1
- paddw m6, m3
- add r2, r1
- js .loop
- mova m5, m6
- psrlq m6, 32
- paddw m6, m5
- mova m5, m6
- psrlq m6, 16
- paddw m6, m5
- movd eax, m6
- and eax, 0xffff
+ paddw m4, m3
+%if cpuflag(mmxext)
+ lea r0, [r0+r1*%3]
+%else
+ add r0, r1
+%endif
+ dec r2
+ jne .loop
+%if mmsize == 16
+ pshufd m0, m4, q0032
+ paddd m4, m0
+%elif notcpuflag(mmxext)
+ HADDW m4, m5
+%endif
+ movd eax, m4
RET
+%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16 8, 4, 2
+%endif
+INIT_XMM sse2
+PIX_SUM16 4, 4, 4
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+PIX_SUM16 4, 4, 4
+%endif
+
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
movsxdifnidn r1, r1d
- mov r2, 16
+ mov r2, %2
pxor m0, m0
- pxor m7, m7
+ pxor m5, m5
.loop:
mova m2, [r0+0]
+%if mmsize == 8
mova m3, [r0+8]
- mova m1, m2
- punpckhbw m1, m0
+%else
+ mova m3, [r0+r1]
+%endif
+ punpckhbw m1, m2, m0
punpcklbw m2, m0
- mova m4, m3
- punpckhbw m3, m0
- punpcklbw m4, m0
+ punpckhbw m4, m3, m0
+ punpcklbw m3, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
paddd m2, m1
paddd m4, m3
- paddd m7, m2
+ paddd m5, m2
+ paddd m5, m4
+%if mmsize == 8
add r0, r1
- paddd m7, m4
+%else
+ lea r0, [r0+r1*2]
+%endif
dec r2
jne .loop
- mova m1, m7
- psrlq m7, 32
- paddd m1, m7
- movd eax, m1
+ HADDD m5, m1
+ movd eax, m5
RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
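
The two macros implement simple per-block statistics: pix_sum16 adds all pixels of a 16x16 block, pix_norm1 adds their squares. Scalar sketches for reference:

#include <stdint.h>

static int pix_sum16_sketch(const uint8_t *pix, int line_size)
{
    int sum = 0;
    for (int y = 0; y < 16; y++, pix += line_size)
        for (int x = 0; x < 16; x++)
            sum += pix[x];
    return sum;
}

static int pix_norm1_sketch(const uint8_t *pix, int line_size)
{
    int sum = 0;
    for (int y = 0; y < 16; y++, pix += line_size)
        for (int x = 0; x < 16; x++)
            sum += pix[x] * pix[x];
    return sum;
}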
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index 71fbf2874f..532836cec9 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -1,29 +1,34 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
+int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
+int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
#if HAVE_INLINE_ASM
@@ -123,7 +128,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
: "+r" (ptr)
: "r" ((x86_reg) wrap), "r" ((x86_reg) width),
"r" (ptr + wrap * height));
- } else {
+ } else if (w == 16) {
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
@@ -141,6 +146,25 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
+ : "+r"(ptr)
+ : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+ );
+ } else {
+ av_assert1(w == 4);
+ __asm__ volatile (
+ "1: \n\t"
+ "movd (%0), %%mm0 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t"
+ "movd %%mm0, -4(%0) \n\t"
+ "movd -4(%0, %2), %%mm1 \n\t"
+ "punpcklbw %%mm1, %%mm1 \n\t"
+ "punpckhwd %%mm1, %%mm1 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, (%0, %2) \n\t"
+ "add %1, %0 \n\t"
+ "cmp %3, %0 \n\t"
+ "jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) wrap), "r" ((x86_reg) width),
"r" (ptr + wrap * height));
@@ -195,11 +219,26 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
{
int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx;
}
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->pix_sum = ff_pix_sum16_mmxext;
+ }
+#endif
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->pix_sum = ff_pix_sum16_sse2;
+ c->pix_norm1 = ff_pix_norm1_sse2;
+ }
+
+ if (EXTERNAL_XOP(cpu_flags)) {
+ c->pix_sum = ff_pix_sum16_xop;
+ }
+
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
diff --git a/libavcodec/x86/pixblockdsp.asm b/libavcodec/x86/pixblockdsp.asm
index 871244297c..440fe29bcc 100644
--- a/libavcodec/x86/pixblockdsp.asm
+++ b/libavcodec/x86/pixblockdsp.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
@@ -50,7 +50,7 @@ cglobal get_pixels, 3,4
REP_RET
INIT_XMM sse2
-cglobal get_pixels, 3, 4
+cglobal get_pixels, 3, 4, 5
lea r3, [r2*3]
pxor m4, m4
movh m0, [r1]
@@ -80,28 +80,49 @@ cglobal get_pixels, 3, 4
mova [r0+0x70], m3
RET
-INIT_MMX mmx
; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
; ptrdiff_t stride);
-cglobal diff_pixels, 4,5
- pxor m7, m7
+%macro DIFF_PIXELS 0
+cglobal diff_pixels, 4,5,5
+ pxor m4, m4
add r0, 128
mov r4, -128
.loop:
- mova m0, [r1]
- mova m2, [r2]
- mova m1, m0
- mova m3, m2
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
+ movq m0, [r1]
+ movq m2, [r2]
+%if mmsize == 8
+ movq m1, m0
+ movq m3, m2
+ punpcklbw m0, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m4
+ punpckhbw m3, m4
+%else
+ movq m1, [r1+r3]
+ movq m3, [r2+r3]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+%endif
psubw m0, m2
psubw m1, m3
mova [r0+r4+0], m0
- mova [r0+r4+8], m1
+ mova [r0+r4+mmsize], m1
+%if mmsize == 8
add r1, r3
add r2, r3
- add r4, 16
+%else
+ lea r1, [r1+r3*2]
+ lea r2, [r2+r3*2]
+%endif
+ add r4, 2 * mmsize
jne .loop
- REP_RET
+ RET
+%endmacro
+
+INIT_MMX mmx
+DIFF_PIXELS
+
+INIT_XMM sse2
+DIFF_PIXELS
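
Both DIFF_PIXELS instantiations compute the same 8x8 kernel: the byte-wise difference of two source blocks, widened to int16_t. Scalar sketch:

#include <stddef.h>
#include <stdint.h>

static void diff_pixels_sketch(int16_t *block, const uint8_t *s1,
                               const uint8_t *s2, ptrdiff_t stride)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            block[x] = s1[x] - s2[x];
        block += 8;
        s1    += stride;
        s2    += stride;
    }
}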
diff --git a/libavcodec/x86/pixblockdsp_init.c b/libavcodec/x86/pixblockdsp_init.c
index faa5141327..fa9578a2d3 100644
--- a/libavcodec/x86/pixblockdsp_init.c
+++ b/libavcodec/x86/pixblockdsp_init.c
@@ -1,20 +1,20 @@
/*
* SIMD-optimized pixel operations
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,6 +27,8 @@ void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride);
+void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride);
av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
AVCodecContext *avctx,
@@ -43,5 +45,6 @@ av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
if (EXTERNAL_SSE2(cpu_flags)) {
if (!high_bit_depth)
c->get_pixels = ff_get_pixels_sse2;
+ c->diff_pixels = ff_diff_pixels_sse2;
}
}
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index 722caf0fd1..50e4255dec 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
and waq, ~(mmsize*2-1)
jmp .end_v
.loop_v:
- mova m0, [src1q+iq]
- mova m1, [src1q+iq+mmsize]
- paddb m0, [src2q+iq]
- paddb m1, [src2q+iq+mmsize]
- mova [dstq+iq ], m0
- mova [dstq+iq+mmsize], m1
+ movu m0, [src2q+iq]
+ movu m1, [src2q+iq+mmsize]
+ paddb m0, [src1q+iq]
+ paddb m1, [src1q+iq+mmsize]
+ movu [dstq+iq ], m0
+ movu [dstq+iq+mmsize], m1
add iq, mmsize*2
.end_v:
cmp iq, waq
@@ -157,7 +157,7 @@ cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
movh [dstq], m3
add dstq, bppq
cmp dstq, endq
- jle .loop
+ jl .loop
mov dstq, [rsp]
dec cntrq
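
Two changes are folded into this pngdsp hunk: add_bytes_l2 now uses unaligned movu loads/stores for src2 and dst (presumably because only src1 is still guaranteed to be suitably aligned, so it stays as the paddb memory operand), and the Paeth loop exit is tightened from jle to jl, which looks like an off-by-one fix (the old condition ran one extra iteration once dst had reached end). For reference, a scalar sketch of what add_bytes_l2 computes; add_bytes_l2_c is an illustrative name only:

#include <stdint.h>

/* Byte-wise, wrapping sum of two rows, matching the paddb in the loop
 * above; used when undoing PNG row filtering. */
static void add_bytes_l2_c(uint8_t *dst, const uint8_t *src1,
                           const uint8_t *src2, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = src1[i] + src2[i];
}
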
diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/x86/pngdsp_init.c
index 34a3da36d7..7dca62c675 100644
--- a/libavcodec/x86/pngdsp_init.c
+++ b/libavcodec/x86/pngdsp_init.c
@@ -2,20 +2,20 @@
* x86 PNG optimizations.
 * Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 9613fa1448..16fc262aeb 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -1,427 +1,66 @@
;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
-;* this is identical to "simple" IDCT except for the clip range
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
-%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
-%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
-%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
-%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
-%define W6sh2 8867 ; W6 = 35468 = 8867<<2
-%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
-
%if ARCH_X86_64
SECTION_RODATA
-w4_plus_w2: times 4 dw W4sh2, +W2sh2
-w4_min_w2: times 4 dw W4sh2, -W2sh2
-w4_plus_w6: times 4 dw W4sh2, +W6sh2
-w4_min_w6: times 4 dw W4sh2, -W6sh2
-w1_plus_w3: times 4 dw W1sh2, +W3sh2
-w3_min_w1: times 4 dw W3sh2, -W1sh2
-w7_plus_w3: times 4 dw W7sh2, +W3sh2
-w3_min_w7: times 4 dw W3sh2, -W7sh2
-w1_plus_w5: times 4 dw W1sh2, +W5sh2
-w5_min_w1: times 4 dw W5sh2, -W1sh2
-w5_plus_w7: times 4 dw W5sh2, +W7sh2
-w7_min_w5: times 4 dw W7sh2, -W5sh2
-row_round: times 8 dw (1<<14)
-
+pw_88: times 8 dw 0x2008
+cextern pw_1
cextern pw_4
-cextern pw_8
-cextern pw_512
cextern pw_1019
+; The constants below are defined in simple_idct10.asm, which is built when idctdsp is selected
+cextern w4_plus_w2
+cextern w4_min_w2
+cextern w4_plus_w6
+cextern w4_min_w6
+cextern w1_plus_w3
+cextern w3_min_w1
+cextern w7_plus_w3
+cextern w3_min_w7
+cextern w1_plus_w5
+cextern w5_min_w1
+cextern w5_plus_w7
+cextern w7_min_w5
+
+%include "libavcodec/x86/simple_idct10_template.asm"
SECTION .text
-; interleave data while maintaining source
-; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
-%macro SBUTTERFLY3 5
- punpckl%1 m%2, m%4, m%5
- punpckh%1 m%3, m%4, m%5
-%endmacro
-
-; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
-; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
-; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
-%macro SUMSUB_SHPK 7
- psubd %3, %1, %5 ; { a0 - b0 }[0-3]
- psubd %4, %2, %6 ; { a0 - b0 }[4-7]
- paddd %1, %5 ; { a0 + b0 }[0-3]
- paddd %2, %6 ; { a0 + b0 }[4-7]
- psrad %1, %7
- psrad %2, %7
- psrad %3, %7
- psrad %4, %7
- packssdw %1, %2 ; row[0]
- packssdw %3, %4 ; row[7]
-%endmacro
-
-; %1 = row or col (for rounding variable)
-; %2 = number of bits to shift at the end
-%macro IDCT_1D 2
- ; a0 = (W4 * row[0]) + (1 << (15 - 1));
- ; a1 = a0;
- ; a2 = a0;
- ; a3 = a0;
- ; a0 += W2 * row[2];
- ; a1 += W6 * row[2];
- ; a2 -= W6 * row[2];
- ; a3 -= W2 * row[2];
-%ifidn %1, col
- paddw m10,[pw_8]
-%endif
- SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
-%ifidn %1, row
- psubw m10,[row_round]
-%endif
- SIGNEXTEND m8, m9, m14 ; { row[2] }[0-3] / [4-7]
- SIGNEXTEND m10, m11, m14 ; { row[0] }[0-3] / [4-7]
- pmaddwd m2, m0, [w4_plus_w6]
- pmaddwd m3, m1, [w4_plus_w6]
- pmaddwd m4, m0, [w4_min_w6]
- pmaddwd m5, m1, [w4_min_w6]
- pmaddwd m6, m0, [w4_min_w2]
- pmaddwd m7, m1, [w4_min_w2]
- pmaddwd m0, [w4_plus_w2]
- pmaddwd m1, [w4_plus_w2]
- pslld m2, 2
- pslld m3, 2
- pslld m4, 2
- pslld m5, 2
- pslld m6, 2
- pslld m7, 2
- pslld m0, 2
- pslld m1, 2
-
- ; a0: -1*row[0]-1*row[2]
- ; a1: -1*row[0]
- ; a2: -1*row[0]
- ; a3: -1*row[0]+1*row[2]
- psubd m2, m10 ; a1[0-3]
- psubd m3, m11 ; a1[4-7]
- psubd m4, m10 ; a2[0-3]
- psubd m5, m11 ; a2[4-7]
- psubd m0, m10
- psubd m1, m11
- psubd m6, m10
- psubd m7, m11
- psubd m0, m8 ; a0[0-3]
- psubd m1, m9 ; a0[4-7]
- paddd m6, m8 ; a3[0-3]
- paddd m7, m9 ; a3[4-7]
-
- ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
- ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
- ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
- ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
- SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
- SIGNEXTEND m13, m14, m10 ; { row[4] }[0-3] / [4-7]
- pmaddwd m10, m8, [w4_plus_w6]
- pmaddwd m11, m9, [w4_plus_w6]
- pslld m10, 2
- pslld m11, 2
- psubd m10, m13
- psubd m11, m14
- paddd m0, m10 ; a0[0-3]
- paddd m1, m11 ; a0[4-7]
- pmaddwd m10, m8, [w4_min_w6]
- pmaddwd m11, m9, [w4_min_w6]
- pslld m10, 2
- pslld m11, 2
- psubd m10, m13
- psubd m11, m14
- paddd m6, m10 ; a3[0-3]
- paddd m7, m11 ; a3[4-7]
- pmaddwd m10, m8, [w4_min_w2]
- pmaddwd m11, m9, [w4_min_w2]
- pmaddwd m8, [w4_plus_w2]
- pmaddwd m9, [w4_plus_w2]
- pslld m10, 2
- pslld m11, 2
- pslld m8, 2
- pslld m9, 2
- psubd m10, m13
- psubd m11, m14
- psubd m8, m13
- psubd m9, m14
- psubd m4, m10 ; a2[0-3] intermediate
- psubd m5, m11 ; a2[4-7] intermediate
- psubd m2, m8 ; a1[0-3] intermediate
- psubd m3, m9 ; a1[4-7] intermediate
- SIGNEXTEND m12, m13, m10 ; { row[6] }[0-3] / [4-7]
- psubd m4, m12 ; a2[0-3]
- psubd m5, m13 ; a2[4-7]
- paddd m2, m12 ; a1[0-3]
- paddd m3, m13 ; a1[4-7]
-
- ; load/store
- mova [r2+ 0], m0
- mova [r2+ 32], m2
- mova [r2+ 64], m4
- mova [r2+ 96], m6
- mova m10,[r2+ 16] ; { row[1] }[0-7]
- mova m8, [r2+ 48] ; { row[3] }[0-7]
- mova m13,[r2+ 80] ; { row[5] }[0-7]
- mova m14,[r2+112] ; { row[7] }[0-7]
- mova [r2+ 16], m1
- mova [r2+ 48], m3
- mova [r2+ 80], m5
- mova [r2+112], m7
-%ifidn %1, row
- pmullw m10,[r3+ 16]
- pmullw m8, [r3+ 48]
- pmullw m13,[r3+ 80]
- pmullw m14,[r3+112]
-%endif
-
- ; b0 = MUL(W1, row[1]);
- ; MAC(b0, W3, row[3]);
- ; b1 = MUL(W3, row[1]);
- ; MAC(b1, -W7, row[3]);
- ; b2 = MUL(W5, row[1]);
- ; MAC(b2, -W1, row[3]);
- ; b3 = MUL(W7, row[1]);
- ; MAC(b3, -W5, row[3]);
- SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
- SIGNEXTEND m10, m11, m12 ; { row[1] }[0-3] / [4-7]
- SIGNEXTEND m8, m9, m12 ; { row[3] }[0-3] / [4-7]
- pmaddwd m2, m0, [w3_min_w7]
- pmaddwd m3, m1, [w3_min_w7]
- pmaddwd m4, m0, [w5_min_w1]
- pmaddwd m5, m1, [w5_min_w1]
- pmaddwd m6, m0, [w7_min_w5]
- pmaddwd m7, m1, [w7_min_w5]
- pmaddwd m0, [w1_plus_w3]
- pmaddwd m1, [w1_plus_w3]
- pslld m2, 2
- pslld m3, 2
- pslld m4, 2
- pslld m5, 2
- pslld m6, 2
- pslld m7, 2
- pslld m0, 2
- pslld m1, 2
-
- ; b0: +1*row[1]+2*row[3]
- ; b1: +2*row[1]-1*row[3]
- ; b2: -1*row[1]-1*row[3]
- ; b3: +1*row[1]+1*row[3]
- psubd m2, m8
- psubd m3, m9
- paddd m0, m8
- paddd m1, m9
- paddd m8, m10 ; { row[1] + row[3] }[0-3]
- paddd m9, m11 ; { row[1] + row[3] }[4-7]
- paddd m10, m10
- paddd m11, m11
- paddd m0, m8 ; b0[0-3]
- paddd m1, m9 ; b0[4-7]
- paddd m2, m10 ; b1[0-3]
- paddd m3, m11 ; b2[4-7]
- psubd m4, m8 ; b2[0-3]
- psubd m5, m9 ; b2[4-7]
- paddd m6, m8 ; b3[0-3]
- paddd m7, m9 ; b3[4-7]
-
- ; MAC(b0, W5, row[5]);
- ; MAC(b0, W7, row[7]);
- ; MAC(b1, -W1, row[5]);
- ; MAC(b1, -W5, row[7]);
- ; MAC(b2, W7, row[5]);
- ; MAC(b2, W3, row[7]);
- ; MAC(b3, W3, row[5]);
- ; MAC(b3, -W1, row[7]);
- SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
- SIGNEXTEND m13, m12, m11 ; { row[5] }[0-3] / [4-7]
- SIGNEXTEND m14, m11, m10 ; { row[7] }[0-3] / [4-7]
-
- ; b0: -1*row[5]+1*row[7]
- ; b1: -1*row[5]+1*row[7]
- ; b2: +1*row[5]+2*row[7]
- ; b3: +2*row[5]-1*row[7]
- paddd m4, m13
- paddd m5, m12
- paddd m6, m13
- paddd m7, m12
- psubd m13, m14 ; { row[5] - row[7] }[0-3]
- psubd m12, m11 ; { row[5] - row[7] }[4-7]
- paddd m14, m14
- paddd m11, m11
- psubd m0, m13
- psubd m1, m12
- psubd m2, m13
- psubd m3, m12
- paddd m4, m14
- paddd m5, m11
- paddd m6, m13
- paddd m7, m12
-
- pmaddwd m10, m8, [w1_plus_w5]
- pmaddwd m11, m9, [w1_plus_w5]
- pmaddwd m12, m8, [w5_plus_w7]
- pmaddwd m13, m9, [w5_plus_w7]
- pslld m10, 2
- pslld m11, 2
- pslld m12, 2
- pslld m13, 2
- psubd m2, m10 ; b1[0-3]
- psubd m3, m11 ; b1[4-7]
- paddd m0, m12 ; b0[0-3]
- paddd m1, m13 ; b0[4-7]
- pmaddwd m12, m8, [w7_plus_w3]
- pmaddwd m13, m9, [w7_plus_w3]
- pmaddwd m8, [w3_min_w1]
- pmaddwd m9, [w3_min_w1]
- pslld m12, 2
- pslld m13, 2
- pslld m8, 2
- pslld m9, 2
- paddd m4, m12 ; b2[0-3]
- paddd m5, m13 ; b2[4-7]
- paddd m6, m8 ; b3[0-3]
- paddd m7, m9 ; b3[4-7]
-
- ; row[0] = (a0 + b0) >> 15;
- ; row[7] = (a0 - b0) >> 15;
- ; row[1] = (a1 + b1) >> 15;
- ; row[6] = (a1 - b1) >> 15;
- ; row[2] = (a2 + b2) >> 15;
- ; row[5] = (a2 - b2) >> 15;
- ; row[3] = (a3 + b3) >> 15;
- ; row[4] = (a3 - b3) >> 15;
- mova m8, [r2+ 0] ; a0[0-3]
- mova m9, [r2+16] ; a0[4-7]
- SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
- mova m0, [r2+32] ; a1[0-3]
- mova m1, [r2+48] ; a1[4-7]
- SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
- mova m1, [r2+64] ; a2[0-3]
- mova m2, [r2+80] ; a2[4-7]
- SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
- mova m2, [r2+96] ; a3[0-3]
- mova m3, [r2+112] ; a3[4-7]
- SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
-%endmacro
-
-; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t linesize,
-; int16_t *block, const int16_t *qmat);
-%macro idct_put_fn 1
-cglobal prores_idct_put_10, 4, 4, %1
- pxor m15, m15 ; zero
-
- ; for (i = 0; i < 8; i++)
- ; idctRowCondDC(block + i*8);
- mova m10,[r2+ 0] ; { row[0] }[0-7]
- mova m8, [r2+32] ; { row[2] }[0-7]
- mova m13,[r2+64] ; { row[4] }[0-7]
- mova m12,[r2+96] ; { row[6] }[0-7]
-
- pmullw m10,[r3+ 0]
- pmullw m8, [r3+32]
- pmullw m13,[r3+64]
- pmullw m12,[r3+96]
-
- IDCT_1D row, 17
-
- ; transpose for second part of IDCT
- TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
- mova [r2+ 16], m0
- mova [r2+ 48], m2
- mova [r2+ 80], m11
- mova [r2+112], m10
- SWAP 8, 10
- SWAP 1, 8
- SWAP 4, 13
- SWAP 9, 12
-
- ; for (i = 0; i < 8; i++)
- ; idctSparseColAdd(dest + i, line_size, block + i);
- IDCT_1D col, 20
-
- ; clip/store
- mova m6, [pw_512]
- mova m3, [pw_4]
- mova m5, [pw_1019]
- paddw m8, m6
- paddw m0, m6
- paddw m1, m6
- paddw m2, m6
- paddw m4, m6
- paddw m11, m6
- paddw m9, m6
- paddw m10, m6
- pmaxsw m8, m3
- pmaxsw m0, m3
- pmaxsw m1, m3
- pmaxsw m2, m3
- pmaxsw m4, m3
- pmaxsw m11, m3
- pmaxsw m9, m3
- pmaxsw m10, m3
- pminsw m8, m5
- pminsw m0, m5
- pminsw m1, m5
- pminsw m2, m5
- pminsw m4, m5
- pminsw m11, m5
- pminsw m9, m5
- pminsw m10, m5
-
- lea r2, [r1*3]
- mova [r0 ], m8
- mova [r0+r1 ], m0
- mova [r0+r1*2], m1
- mova [r0+r2 ], m2
- lea r0, [r0+r1*4]
- mova [r0 ], m4
- mova [r0+r1 ], m11
- mova [r0+r1*2], m9
- mova [r0+r2 ], m10
+%macro idct_fn 0
+cglobal prores_idct_put_10, 4, 4, 15
+ IDCT_FN pw_1, 15, pw_88, 18, pw_4, pw_1019, r3
RET
%endmacro
-%macro SIGNEXTEND 2-3
-%if cpuflag(sse4) ; dstlow, dsthigh
- movhlps %2, %1
- pmovsxwd %1, %1
- pmovsxwd %2, %2
-%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
- pxor %3, %3
- pcmpgtw %3, %1
- mova %2, %1
- punpcklwd %1, %3
- punpckhwd %2, %3
-%endif
-%endmacro
-
INIT_XMM sse2
-idct_put_fn 16
-INIT_XMM sse4
-idct_put_fn 16
+idct_fn
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
-idct_put_fn 16
+idct_fn
+%endif
%endif
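
The bulk of this proresdsp.asm change is deletion: the open-coded 10-bit IDCT is replaced by the shared IDCT_FN macro from simple_idct10_template.asm (added later in this patch), parameterized with the ProRes-specific clip constants pw_4 and pw_1019. What stays ProRes-specific is essentially the final clamp of each output sample; a sketch of that one step (prores_clip10 is an illustrative helper, not an FFmpeg symbol):

#include <stdint.h>

/* Clamp one reconstructed sample to [4, 1019], the range the removed
 * pmaxsw/pminsw pair (pw_4 / pw_1019) enforced for 10-bit ProRes. */
static inline uint16_t prores_clip10(int v)
{
    if (v <    4) return    4;
    if (v > 1019) return 1019;
    return (uint16_t)v;
}
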
diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c
index ff4d39836b..8ca4d4d9b3 100644
--- a/libavcodec/x86/proresdsp_init.c
+++ b/libavcodec/x86/proresdsp_init.c
@@ -3,20 +3,20 @@
*
* Copyright (c) 2010-2011 Maxim Poliakovski
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,12 +27,10 @@
void ff_prores_idct_put_10_sse2(uint16_t *dst, ptrdiff_t linesize,
int16_t *block, const int16_t *qmat);
-void ff_prores_idct_put_10_sse4(uint16_t *dst, ptrdiff_t linesize,
- int16_t *block, const int16_t *qmat);
void ff_prores_idct_put_10_avx (uint16_t *dst, ptrdiff_t linesize,
int16_t *block, const int16_t *qmat);
-av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
+av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx)
{
#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
@@ -42,11 +40,6 @@ av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
dsp->idct_put = ff_prores_idct_put_10_sse2;
}
- if (EXTERNAL_SSE4(cpu_flags)) {
- dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
- dsp->idct_put = ff_prores_idct_put_10_sse4;
- }
-
if (EXTERNAL_AVX(cpu_flags)) {
dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
dsp->idct_put = ff_prores_idct_put_10_avx;
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index 27a1c63b8a..4e72d5084f 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm
index ef5f1d8826..282faed14f 100644
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@@ -1,22 +1,23 @@
;******************************************************************************
-;* quarterpel DSP functions
-;*
+;* mpeg4 qpel
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index cdefe50a3c..3268d907ab 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -1,20 +1,22 @@
/*
* quarterpel DSP functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -77,13 +79,13 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride);
-#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
-#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
#if HAVE_YASM
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, \
diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c
index 0c76d91647..09946bd23f 100644
--- a/libavcodec/x86/rnd_template.c
+++ b/libavcodec/x86/rnd_template.c
@@ -7,20 +7,20 @@
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -30,7 +30,7 @@
#include "inline_asm.h"
// put_pixels
-STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
+av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
@@ -46,12 +46,12 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
+ "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
- "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
@@ -67,11 +67,11 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
- "movq %%mm4, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
+ "movq %%mm4, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm2\n\t" // 0 <-> 2 1 <-> 3
- "movq 1(%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
@@ -87,8 +87,8 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
+ "movq %%mm0, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
@@ -99,7 +99,7 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
// avg_pixels
// this routine is 'slightly' suboptimal but mostly unused
-STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
+av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
@@ -115,12 +115,12 @@ STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
+ "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
- "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
@@ -135,16 +135,16 @@ STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
- "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
+ "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
- "movq %%mm5, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
+ "movq %%mm5, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm2\n\t" // 0 <-> 2 1 <-> 3
- "movq 1(%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
@@ -159,13 +159,13 @@ STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
- "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
+ "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
- "movq %%mm1, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
+ "movq %%mm1, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 4d9c35b600..692b4acfcd 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -2,20 +2,20 @@
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -64,6 +64,7 @@ rv34_idct dc
rv34_idct dc_noround
; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+%if ARCH_X86_32
INIT_MMX mmx
cglobal rv34_idct_dc_add, 3, 3
; calculate DC
@@ -97,6 +98,7 @@ cglobal rv34_idct_dc_add, 3, 3
movh [r2], m4
movh [r2+r1], m5
RET
+%endif
; Load coeffs and perform row transform
; Output: coeffs in mm[0467], rounder in mm5
@@ -167,7 +169,7 @@ cglobal rv34_idct_add, 3,3,0, d, s, b
ret
; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
-INIT_XMM sse4
+%macro RV34_IDCT_DC_ADD 0
cglobal rv34_idct_dc_add, 3, 3, 6
; load data
IDCT_DC_ROUND r2
@@ -190,7 +192,22 @@ cglobal rv34_idct_dc_add, 3, 3, 6
paddw m4, m0
packuswb m2, m4
movd [r0], m2
+%if cpuflag(sse4)
pextrd [r0+r1], m2, 1
pextrd [r2], m2, 2
pextrd [r2+r1], m2, 3
+%else
+ psrldq m2, 4
+ movd [r0+r1], m2
+ psrldq m2, 4
+ movd [r2], m2
+ psrldq m2, 4
+ movd [r2+r1], m2
+%endif
RET
+%endmacro
+
+INIT_XMM sse2
+RV34_IDCT_DC_ADD
+INIT_XMM sse4
+RV34_IDCT_DC_ADD
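
This rv34dsp.asm hunk turns the SSE4-only rv34_idct_dc_add into a macro instantiated for both SSE2 and SSE4: the only SSE4-specific instruction was pextrd, which the SSE2 path replaces with psrldq/movd pairs, while the old MMX version is now built for x86_32 only. In scalar terms the function adds a pre-scaled DC value to a 4x4 block with saturation; the DC scaling itself is done by the IDCT_DC_ROUND macro and is not reproduced in this sketch (rv34_idct_dc_add_c and clip_uint8 are illustrative names):

#include <stddef.h>
#include <stdint.h>

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Add the DC term to every pixel of a 4x4 block, clamping to 8 bits. */
static void rv34_idct_dc_add_c(uint8_t *dst, ptrdiff_t stride, int dc)
{
    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++)
            dst[x] = clip_uint8(dst[x] + dc);
        dst += stride;
    }
}
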
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 32d4c1aac2..7310122458 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -2,20 +2,20 @@
* RV30/40 MMX/SSE2 optimizations
* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,6 +27,7 @@
void ff_rv34_idct_dc_mmxext(int16_t *block);
void ff_rv34_idct_dc_noround_mmxext(int16_t *block);
void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
+void ff_rv34_idct_dc_add_sse2(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
@@ -34,12 +35,14 @@ av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
{
int cpu_flags = av_get_cpu_flags();
- if (EXTERNAL_MMX(cpu_flags))
+ if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
c->rv34_idct_add = ff_rv34_idct_add_mmxext;
}
+ if (EXTERNAL_SSE2(cpu_flags))
+ c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse2;
if (EXTERNAL_SSE4(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
}
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 77f6ddb25d..d0c3af0f8d 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index f6d4165452..340173d063 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -2,20 +2,20 @@
* RV40 decoder motion compensation functions x86-optimised
* Copyright (c) 2008 Konstantin Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -32,6 +32,13 @@
#include "libavutil/x86/cpu.h"
#include "hpeldsp.h"
+#define DEFINE_FN(op, size, insn) \
+static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
+}
+
#if HAVE_YASM
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
@@ -75,7 +82,7 @@ static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
{ \
int i; \
if (PH && PV) { \
- DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \
+ LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]); \
uint8_t *tmpptr = tmp + SIZE * 2; \
src -= stride * 2; \
\
@@ -127,8 +134,8 @@ QPEL_FUNCS_DECL(OP, 3, 2, OPT)
/** @} */
#define LOOPSIZE 8
-#define HCOFF(x) (32 * (x - 1))
-#define VCOFF(x) (32 * (x - 1))
+#define HCOFF(x) (32 * ((x) - 1))
+#define VCOFF(x) (32 * ((x) - 1))
QPEL_MC_DECL(put_, _ssse3)
QPEL_MC_DECL(avg_, _ssse3)
@@ -136,8 +143,8 @@ QPEL_MC_DECL(avg_, _ssse3)
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 8
-#define HCOFF(x) (64 * (x - 1))
-#define VCOFF(x) (64 * (x - 1))
+#define HCOFF(x) (64 * ((x) - 1))
+#define VCOFF(x) (64 * ((x) - 1))
QPEL_MC_DECL(put_, _sse2)
QPEL_MC_DECL(avg_, _sse2)
@@ -146,8 +153,8 @@ QPEL_MC_DECL(avg_, _sse2)
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 4
-#define HCOFF(x) (64 * (x - 1))
-#define VCOFF(x) (64 * (x - 1))
+#define HCOFF(x) (64 * ((x) - 1))
+#define VCOFF(x) (64 * ((x) - 1))
QPEL_MC_DECL(put_, _mmx)
@@ -186,34 +193,28 @@ QPEL_FUNCS_SET (OP, 3, 1, OPT) \
QPEL_FUNCS_SET (OP, 3, 2, OPT)
/** @} */
+DEFINE_FN(put, 8, ssse3)
+
+DEFINE_FN(put, 16, sse2)
+DEFINE_FN(put, 16, ssse3)
+
+DEFINE_FN(avg, 8, mmxext)
+DEFINE_FN(avg, 8, ssse3)
+
+DEFINE_FN(avg, 16, sse2)
+DEFINE_FN(avg, 16, ssse3)
#endif /* HAVE_YASM */
#if HAVE_MMX_INLINE
-static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride)
-{
- ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride)
-{
- ff_put_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride)
-{
- ff_avg_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride)
-{
- ff_avg_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-#endif /* HAVE_MMX_INLINE */
+DEFINE_FN(put, 8, mmx)
+DEFINE_FN(avg, 8, mmx)
+DEFINE_FN(put, 16, mmx)
+DEFINE_FN(avg, 16, mmx)
+#endif
av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
{
- int cpu_flags = av_get_cpu_flags();
+ av_unused int cpu_flags = av_get_cpu_flags();
#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags)) {
@@ -240,6 +241,7 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmxext;
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
@@ -251,6 +253,8 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
#endif
}
if (EXTERNAL_SSE2(cpu_flags)) {
+ c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_sse2;
+ c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_sse2;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
@@ -259,6 +263,10 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
QPEL_MC_SET(avg_, _sse2)
}
if (EXTERNAL_SSSE3(cpu_flags)) {
+ c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_ssse3;
+ c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_ssse3;
+ c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_ssse3;
+ c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_ssse3;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
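
The DEFINE_FN macro introduced at the top of rv40dsp_init.c regenerates the hand-written mc33 wrappers that this hunk deletes; for example, DEFINE_FN(put, 8, mmx) expands (modulo whitespace) to the same function the patch removes:

/* Expansion of DEFINE_FN(put, 8, mmx); ff_put_pixels8_xy2_mmx is an
 * existing hpeldsp entry point declared via this file's includes. */
static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride)
{
    ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
}
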
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index b449de5f9a..07a412b2ae 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -2,20 +2,20 @@
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -25,7 +25,14 @@ SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
-ps_neg times 4 dd 1<<31
+ps_mask3 dd 0, 0, 0, 1<<31
+ps_noise0 times 2 dd 1.0, 0.0
+ps_noise2 times 2 dd -1.0, 0.0
+ps_noise13 dd 0.0, 1.0, 0.0, -1.0
+ dd 0.0, -1.0, 0.0, 1.0
+ dd 0.0, 1.0, 0.0, -1.0
+cextern sbr_noise_table
+cextern ps_neg
SECTION .text
@@ -136,7 +143,6 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
mova m3, m1
mova m4, m2
- mova m7, [ps_mask]
; Set pointers
%if ARCH_X86_64 == 0 || WIN64
@@ -156,30 +162,28 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
shl start, 3 ; offset from num loops
mova m0, [X_lowq + start]
- movlhps m1, m1 ; (a2 a3 a2 a3)
- movlhps m2, m2 ; (a0 a1 a0 a1)
- shufps m3, m3, q0101 ; (a3 a2 a3 a2)
- shufps m4, m4, q0101 ; (a1 a0 a1 a0)
- xorps m3, m7 ; (-a3 a2 -a3 a2)
- xorps m4, m7 ; (-a1 a0 -a1 a0)
+ shufps m3, m3, q1111
+ shufps m4, m4, q1111
+ xorps m3, [ps_mask]
+ shufps m1, m1, q0000
+ shufps m2, m2, q0000
+ xorps m4, [ps_mask]
.loop2:
- mova m5, m0
+ movu m7, [X_lowq + start + 8] ; BbCc
mova m6, m0
- shufps m0, m0, q2200 ; {Xl[-2][0],",Xl[-1][0],"}
- shufps m5, m5, q3311 ; {Xl[-2][1],",Xl[-1][1],"}
- mulps m0, m2
- mulps m5, m4
- mova m7, m6
- addps m5, m0
- mova m0, [X_lowq + start + 2*2*4]
- shufps m6, m0, q0022 ; {Xl[-1][0],",Xl[0][0],"}
- shufps m7, m0, q1133 ; {Xl[-1][1],",Xl[1][1],"}
- mulps m6, m1
+ mova m5, m7
+ shufps m0, m0, q2301 ; aAbB
+ shufps m7, m7, q2301 ; bBcC
+ mulps m0, m4
mulps m7, m3
- addps m5, m6
+ mulps m6, m2
+ mulps m5, m1
+ addps m7, m0
+ mova m0, [X_lowq + start +16] ; CcDd
addps m7, m0
- addps m5, m7
- mova [X_highq + start], m5
+ addps m6, m5
+ addps m7, m6
+ mova [X_highq + start], m7
add start, 16
jnz .loop2
RET
@@ -246,33 +250,47 @@ cglobal sbr_neg_odd_64, 1,2,4,z
jne .loop
REP_RET
-INIT_XMM sse2
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
+%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
mov cq, 64*4-2*mmsize
lea vrevq, [vq + 64*4]
.loop:
mova m0, [src0q+cq]
mova m1, [src1q]
- mova m2, [src0q+cq+mmsize]
- mova m3, [src1q+mmsize]
- pshufd m4, m0, q0123
- pshufd m5, m1, q0123
- pshufd m6, m2, q0123
- pshufd m7, m3, q0123
- addps m3, m4
+ mova m4, [src0q+cq+mmsize]
+ mova m5, [src1q+mmsize]
+%if cpuflag(sse2)
+ pshufd m2, m0, q0123
+ pshufd m3, m1, q0123
+ pshufd m6, m4, q0123
+ pshufd m7, m5, q0123
+%else
+ shufps m2, m0, m0, q0123
+ shufps m3, m1, m1, q0123
+ shufps m6, m4, m4, q0123
+ shufps m7, m5, m5, q0123
+%endif
+ addps m5, m2
subps m0, m7
addps m1, m6
- subps m2, m5
+ subps m4, m3
mova [vrevq], m1
- mova [vrevq+mmsize], m3
+ mova [vrevq+mmsize], m5
mova [vq+cq], m0
- mova [vq+cq+mmsize], m2
+ mova [vq+cq+mmsize], m4
add src1q, 2*mmsize
add vrevq, 2*mmsize
sub cq, 2*mmsize
jge .loop
REP_RET
+%endmacro
+
+INIT_XMM sse
+SBR_QMF_DEINT_BFLY
+
+INIT_XMM sse2
+SBR_QMF_DEINT_BFLY
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
@@ -303,3 +321,228 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z
movq m2, [zq]
movq [r2q], m2
REP_RET
+
+%ifdef PIC
+%define NREGS 1
+%if UNIX64
+%define NOISE_TABLE r6q ; r5q is m_max
+%else
+%define NOISE_TABLE r5q
+%endif
+%else
+%define NREGS 0
+%define NOISE_TABLE sbr_noise_table
+%endif
+
+%macro LOAD_NST 1
+%ifdef PIC
+ lea NOISE_TABLE, [%1]
+ mova m0, [kxq + NOISE_TABLE]
+%else
+ mova m0, [kxq + %1]
+%endif
+%endmacro
+
+INIT_XMM sse2
+; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ mova m0, [ps_noise0]
+ jmp apply_noise_main
+
+; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ and kxq, 1
+ shl kxq, 4
+ LOAD_NST ps_noise13
+ jmp apply_noise_main
+
+; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ mova m0, [ps_noise2]
+ jmp apply_noise_main
+
+; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ and kxq, 1
+ shl kxq, 4
+ LOAD_NST ps_noise13+16
+
+apply_noise_main:
+%if ARCH_X86_64 == 0 || WIN64
+ mov kxd, m_maxm
+%define count kxq
+%else
+%define count m_maxq
+%endif
+ movsxdifnidn noiseq, noised
+ dec noiseq
+ shl count, 2
+%ifdef PIC
+ lea NOISE_TABLE, [sbr_noise_table]
+%endif
+ lea Yq, [Yq + 2*count]
+ add s_mq, count
+ add q_filtq, count
+ shl noiseq, 3
+ pxor m5, m5
+ neg count
+.loop:
+ mova m1, [q_filtq + count]
+ movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
+ movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
+ add noiseq, 2*mmsize
+ and noiseq, 0x1ff<<3
+ punpckhdq m2, m1, m1
+ punpckldq m1, m1
+    mulps             m1, m3            ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
+ mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+ mova m3, [s_mq + count]
+ ; TODO: replace by a vpermd in AVX2
+ punpckhdq m4, m3, m3
+ punpckldq m3, m3
+ pcmpeqd m6, m3, m5 ; m6 == 0
+ pcmpeqd m7, m4, m5 ; m7 == 0
+ mulps m3, m0 ; s_m[m] * phi_sign
+ mulps m4, m0 ; s_m[m] * phi_sign
+ pand m1, m6
+ pand m2, m7
+ movu m6, [Yq + 2*count]
+ movu m7, [Yq + 2*count + mmsize]
+ addps m3, m1
+ addps m4, m2
+ addps m6, m3
+ addps m7, m4
+ movu [Yq + 2*count], m6
+ movu [Yq + 2*count + mmsize], m7
+ add count, mmsize
+ jl .loop
+ RET
+
+INIT_XMM sse
+cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
+%define COUNT 32*4
+%define OFFSET 32*4
+ mov cq, -COUNT
+ lea vrevq, [vq + OFFSET + COUNT]
+ add vq, OFFSET-mmsize
+ add srcq, 2*COUNT
+ mova m3, [ps_neg]
+.loop:
+ mova m0, [srcq + 2*cq + 0*mmsize]
+ mova m1, [srcq + 2*cq + 1*mmsize]
+ shufps m2, m0, m1, q2020
+ shufps m1, m0, q1313
+ xorps m2, m3
+ mova [vq], m1
+ mova [vrevq + cq], m2
+ sub vq, mmsize
+ add cq, mmsize
+ jl .loop
+ REP_RET
+
+%macro SBR_AUTOCORRELATE 0
+cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
+ mov cntq, 37*8
+ add xq, cntq
+ neg cntq
+
+%if cpuflag(sse3)
+%define MOVH movsd
+ movddup m5, [xq+cntq]
+%else
+%define MOVH movlps
+ movlps m5, [xq+cntq]
+ movlhps m5, m5
+%endif
+ MOVH m7, [xq+cntq+8 ]
+ MOVH m1, [xq+cntq+16]
+ shufps m7, m7, q0110
+ shufps m1, m1, q0110
+ mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
+ mulps m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1];
+ mulps m5, m1 ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
+ movaps [rsp ], m3
+ movaps [rsp+16], m4
+ add cntq, 8
+
+ MOVH m2, [xq+cntq+16]
+ movlhps m7, m7
+ shufps m2, m2, q0110
+ mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
+ mulps m4, m7, m2
+ mulps m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1];
+ addps m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
+
+align 16
+.loop:
+ add cntq, 8
+ MOVH m0, [xq+cntq+16]
+ movlhps m1, m1
+ shufps m0, m0, q0110
+ mulps m3, m1, m2
+ mulps m4, m1, m0
+ mulps m1, m1
+ addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+ addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+ addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
+ add cntq, 8
+ MOVH m1, [xq+cntq+16]
+ movlhps m2, m2
+ shufps m1, m1, q0110
+ mulps m3, m2, m0
+ mulps m4, m2, m1
+ mulps m2, m2
+ addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+ addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+ addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
+ add cntq, 8
+ MOVH m2, [xq+cntq+16]
+ movlhps m0, m0
+ shufps m2, m2, q0110
+ mulps m3, m0, m1
+ mulps m4, m0, m2
+ mulps m0, m0
+ addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+ addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+ addps m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
+ jl .loop
+
+ movlhps m1, m1
+ mulps m2, m1
+ mulps m1, m1
+ addps m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
+ addps m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
+ addps m6, [rsp ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
+ addps m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];
+
+ xorps m2, [ps_mask3]
+ xorps m5, [ps_mask3]
+ xorps m6, [ps_mask3]
+ HADDPS m2, m5, m3
+ HADDPS m7, m6, m4
+%if cpuflag(sse3)
+ movshdup m0, m1
+%else
+ movss m0, m1
+ shufps m1, m1, q0001
+%endif
+ addss m1, m0
+ movaps [phiq ], m2
+ movhps [phiq+0x18], m7
+ movss [phiq+0x28], m7
+ movss [phiq+0x10], m1
+ RET
+%endmacro
+
+INIT_XMM sse
+SBR_AUTOCORRELATE
+INIT_XMM sse3
+SBR_AUTOCORRELATE
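
The largest addition to sbrdsp.asm is the family of hf_apply_noise kernels (plus qmf_deint_neg and the autocorrelate routines). Below is a scalar model of the noise kernels, reconstructed from the asm above and therefore only a sketch: the function name and the explicit phi0/phi1 and noise_table parameters are illustrative, and the real variants 0-3 differ only in the ps_noise* sign pattern (with variants 1 and 3 also keyed on the parity of kx).

/* The noise contribution is masked out wherever s_m[m] != 0, matching the
 * pcmpeqd/pand select in the SIMD loop; the table index wraps modulo the
 * 512-entry noise table ("and noiseq, 0x1ff<<3" above). */
static void sbr_hf_apply_noise_c(float (*Y)[2], const float *s_m,
                                 const float *q_filt, int noise,
                                 float phi0, float phi1, int m_max,
                                 const float (*noise_table)[2])
{
    for (int m = 0; m < m_max; m++) {
        noise = (noise + 1) & 0x1ff;
        float nr = s_m[m] ? 0.0f : q_filt[m] * noise_table[noise][0];
        float ni = s_m[m] ? 0.0f : q_filt[m] * noise_table[noise][1];
        Y[m][0] += s_m[m] * phi0 + nr;
        Y[m][1] += s_m[m] * phi1 + ni;
    }
}
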
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 9600852163..6911a1a515 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -2,20 +2,20 @@
* AAC Spectral Band Replication decoding functions
* Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -34,9 +34,28 @@ void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
float bw, int start, int end);
void ff_sbr_neg_odd_64_sse(float *z);
void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_pre_shuffle_sse2(float *z);
+void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+
+void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
+
+void ff_sbr_autocorrelate_sse (const float x[40][2], float phi[3][2][2]);
+void ff_sbr_autocorrelate_sse3(const float x[40][2], float phi[3][2][2]);
+
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
@@ -48,10 +67,21 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
s->hf_g_filt = ff_sbr_hf_g_filt_sse;
s->hf_gen = ff_sbr_hf_gen_sse;
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
+ s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
+ s->qmf_deint_neg = ff_sbr_qmf_deint_neg_sse;
+ s->autocorrelate = ff_sbr_autocorrelate_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
+ s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
+ s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
+ s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
+ s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
+ }
+
+ if (EXTERNAL_SSE3(cpu_flags)) {
+ s->autocorrelate = ff_sbr_autocorrelate_sse3;
}
}
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index 0939a49a3a..d3a19fa60d 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -3,24 +3,23 @@
*
* Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
@@ -86,7 +85,7 @@ DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
static inline void idct(int16_t *block)
{
- DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+ LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
int16_t * const temp= (int16_t*)align_tmp;
__asm__ volatile(
@@ -893,6 +892,7 @@ Temp
"9: \n\t"
:: "r" (block), "r" (temp), "r" (coeffs)
+ NAMED_CONSTRAINTS_ADD(wm1010,d40000)
: "%eax"
);
}
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 15784a9501..ad76bafd85 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,4 +26,16 @@ void ff_simple_idct_mmx(int16_t *block);
void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct10_sse2(int16_t *block);
+void ff_simple_idct10_avx(int16_t *block);
+
+void ff_simple_idct10_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct10_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+void ff_simple_idct12_sse2(int16_t *block);
+void ff_simple_idct12_avx(int16_t *block);
+
+void ff_simple_idct12_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct12_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
#endif /* AVCODEC_X86_SIMPLE_IDCT_H */
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
new file mode 100644
index 0000000000..5dee533de0
--- /dev/null
+++ b/libavcodec/x86/simple_idct10.asm
@@ -0,0 +1,100 @@
+;******************************************************************************
+;* x86-SIMD-optimized IDCT for prores
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
+;*
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;* Copyright (c) 2015 Christophe Gisquet
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+cextern pw_2
+cextern pw_16
+cextern pw_1023
+cextern pw_4095
+pd_round_12: times 4 dd 1<<(12-1)
+pd_round_15: times 4 dd 1<<(15-1)
+pd_round_19: times 4 dd 1<<(19-1)
+
+%macro CONST_DEC 3
+const %1
+times 4 dw %2, %3
+%endmacro
+
+%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
+%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
+%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
+%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
+%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
+%define W6sh2 8867 ; W6 = 35468 = 8867<<2
+%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
+
+CONST_DEC w4_plus_w2, W4sh2, +W2sh2
+CONST_DEC w4_min_w2, W4sh2, -W2sh2
+CONST_DEC w4_plus_w6, W4sh2, +W6sh2
+CONST_DEC w4_min_w6, W4sh2, -W6sh2
+CONST_DEC w1_plus_w3, W1sh2, +W3sh2
+CONST_DEC w3_min_w1, W3sh2, -W1sh2
+CONST_DEC w7_plus_w3, W7sh2, +W3sh2
+CONST_DEC w3_min_w7, W3sh2, -W7sh2
+CONST_DEC w1_plus_w5, W1sh2, +W5sh2
+CONST_DEC w5_min_w1, W5sh2, -W1sh2
+CONST_DEC w5_plus_w7, W5sh2, +W7sh2
+CONST_DEC w7_min_w5, W7sh2, -W5sh2
+
+%include "libavcodec/x86/simple_idct10_template.asm"
+
+SECTION .text
+
+%macro idct_fn 0
+cglobal simple_idct10, 1, 1, 16
+ IDCT_FN "", 12, "", 19
+ RET
+
+cglobal simple_idct10_put, 3, 3, 16
+ IDCT_FN "", 12, "", 19, 0, pw_1023
+ RET
+
+cglobal simple_idct12, 1, 1, 16
+ ; coeffs are already 15bits, adding the offset would cause
+ ; overflow in the input
+ IDCT_FN "", 15, pw_2, 16
+ RET
+
+cglobal simple_idct12_put, 3, 3, 16
+ ; range isn't known, so the C simple_idct range is used
+ ; Also, using a bias on input overflows, so use the bias
+ ; on output of the first butterfly instead
+ IDCT_FN "", 15, pw_2, 16, 0, pw_4095
+ RET
+%endmacro
+
+INIT_XMM sse2
+idct_fn
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+idct_fn
+%endif
+
+%endif
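A note on the constant layout above: CONST_DEC emits each pair of weights interleaved word by word (times 4 dw %2, %3), which matches how SBUTTERFLY3 in the template interleaves two input rows, so a single pmaddwd yields weight0*rowA + weight1*rowB per 32-bit lane. The sketch below is a scalar model of one such lane using the W4sh2/W2sh2 values from the table; it shows only the arithmetic, not the register flow, and the names are taken from the comments above.

    #include <stdint.h>

    /* One pmaddwd lane: two signed 16-bit products summed into 32 bits. */
    static int32_t pmaddwd_lane(int16_t c0, int16_t c1, int16_t x0, int16_t x1)
    {
        return (int32_t)c0 * x0 + (int32_t)c1 * x1;
    }

    /* With the w4_plus_w2 constant this computes W4sh2*row0 + W2sh2*row2.
     * Per the comments above, each stored weight is the full weight shifted
     * right by 2 (the "sh2" suffix) so it fits a signed 16-bit operand. */
    static int32_t a0_partial(int16_t row0, int16_t row2)
    {
        enum { W4sh2 = 16384, W2sh2 = 21407 };
        return pmaddwd_lane(W4sh2, W2sh2, row0, row2);
    }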
diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm
new file mode 100644
index 0000000000..9d323d99b3
--- /dev/null
+++ b/libavcodec/x86/simple_idct10_template.asm
@@ -0,0 +1,315 @@
+;******************************************************************************
+;* x86-SIMD-optimized IDCT for prores
+;* this is identical to the "simple" IDCT written by Michael Niedermayer,
+;* except for the clip range
+;*
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+; add SECTION_RODATA and proper include before including this file!
+
+%if ARCH_X86_64
+
+; interleave data while maintaining source
+; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
+%macro SBUTTERFLY3 5
+ punpckl%1 m%2, m%4, m%5
+ punpckh%1 m%3, m%4, m%5
+%endmacro
+
+; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
+; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
+; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
+%macro SUMSUB_SHPK 7
+ psubd %3, %1, %5 ; { a0 - b0 }[0-3]
+ psubd %4, %2, %6 ; { a0 - b0 }[4-7]
+ paddd %1, %5 ; { a0 + b0 }[0-3]
+ paddd %2, %6 ; { a0 + b0 }[4-7]
+ psrad %1, %7
+ psrad %2, %7
+ psrad %3, %7
+ psrad %4, %7
+ packssdw %1, %2 ; row[0]
+ packssdw %3, %4 ; row[7]
+%endmacro
+
+; %1 = initial bias ("" to apply the pd_round_* dword rounding instead of a word bias)
+; %2 = number of bits to shift at the end
+; %3 = qmat (for prores)
+%macro IDCT_1D 2-3
+ ; a0 = (W4 * row[0]) + (1 << (15 - 1));
+ ; a1 = a0;
+ ; a2 = a0;
+ ; a3 = a0;
+ ; a0 += W2 * row[2];
+ ; a1 += W6 * row[2];
+ ; a2 -= W6 * row[2];
+ ; a3 -= W2 * row[2];
+%ifstr %1
+ mova m15, [pd_round_ %+ %2]
+%else
+ paddw m10, [%1]
+%endif
+ SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
+ pmaddwd m2, m0, [w4_plus_w6]
+ pmaddwd m3, m1, [w4_plus_w6]
+ pmaddwd m4, m0, [w4_min_w6]
+ pmaddwd m5, m1, [w4_min_w6]
+ pmaddwd m6, m0, [w4_min_w2]
+ pmaddwd m7, m1, [w4_min_w2]
+ pmaddwd m0, [w4_plus_w2]
+ pmaddwd m1, [w4_plus_w2]
+%ifstr %1
+ ; add the 1<<(%2-1) rounding bias for values of 15 bits or more
+ paddd m2, m15
+ paddd m3, m15
+ paddd m4, m15
+ paddd m5, m15
+ paddd m6, m15
+ paddd m7, m15
+ paddd m0, m15
+ paddd m1, m15
+%endif
+
+ ; a0: -1*row[0]-1*row[2]
+ ; a1: -1*row[0]
+ ; a2: -1*row[0]
+ ; a3: -1*row[0]+1*row[2]
+
+ ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
+ ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
+ ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
+ ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
+ SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
+ pmaddwd m10, m8, [w4_plus_w6]
+ pmaddwd m11, m9, [w4_plus_w6]
+ paddd m0, m10 ; a0[0-3]
+ paddd m1, m11 ; a0[4-7]
+ pmaddwd m10, m8, [w4_min_w6]
+ pmaddwd m11, m9, [w4_min_w6]
+ paddd m6, m10 ; a3[0-3]
+ paddd m7, m11 ; a3[4-7]
+ pmaddwd m10, m8, [w4_min_w2]
+ pmaddwd m11, m9, [w4_min_w2]
+ pmaddwd m8, [w4_plus_w2]
+ pmaddwd m9, [w4_plus_w2]
+ psubd m4, m10 ; a2[0-3] intermediate
+ psubd m5, m11 ; a2[4-7] intermediate
+ psubd m2, m8 ; a1[0-3] intermediate
+ psubd m3, m9 ; a1[4-7] intermediate
+
+ ; load/store
+ mova [COEFFS+ 0], m0
+ mova [COEFFS+ 32], m2
+ mova [COEFFS+ 64], m4
+ mova [COEFFS+ 96], m6
+ mova m10,[COEFFS+ 16] ; { row[1] }[0-7]
+ mova m8, [COEFFS+ 48] ; { row[3] }[0-7]
+ mova m13,[COEFFS+ 80] ; { row[5] }[0-7]
+ mova m14,[COEFFS+112] ; { row[7] }[0-7]
+ mova [COEFFS+ 16], m1
+ mova [COEFFS+ 48], m3
+ mova [COEFFS+ 80], m5
+ mova [COEFFS+112], m7
+%if %0 == 3
+ pmullw m10,[%3+ 16]
+ pmullw m8, [%3+ 48]
+ pmullw m13,[%3+ 80]
+ pmullw m14,[%3+112]
+%endif
+
+ ; b0 = MUL(W1, row[1]);
+ ; MAC(b0, W3, row[3]);
+ ; b1 = MUL(W3, row[1]);
+ ; MAC(b1, -W7, row[3]);
+ ; b2 = MUL(W5, row[1]);
+ ; MAC(b2, -W1, row[3]);
+ ; b3 = MUL(W7, row[1]);
+ ; MAC(b3, -W5, row[3]);
+ SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
+ pmaddwd m2, m0, [w3_min_w7]
+ pmaddwd m3, m1, [w3_min_w7]
+ pmaddwd m4, m0, [w5_min_w1]
+ pmaddwd m5, m1, [w5_min_w1]
+ pmaddwd m6, m0, [w7_min_w5]
+ pmaddwd m7, m1, [w7_min_w5]
+ pmaddwd m0, [w1_plus_w3]
+ pmaddwd m1, [w1_plus_w3]
+
+ ; b0: +1*row[1]+2*row[3]
+ ; b1: +2*row[1]-1*row[3]
+ ; b2: -1*row[1]-1*row[3]
+ ; b3: +1*row[1]+1*row[3]
+
+ ; MAC(b0, W5, row[5]);
+ ; MAC(b0, W7, row[7]);
+ ; MAC(b1, -W1, row[5]);
+ ; MAC(b1, -W5, row[7]);
+ ; MAC(b2, W7, row[5]);
+ ; MAC(b2, W3, row[7]);
+ ; MAC(b3, W3, row[5]);
+ ; MAC(b3, -W1, row[7]);
+ SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
+
+ ; b0: -1*row[5]+1*row[7]
+ ; b1: -1*row[5]+1*row[7]
+ ; b2: +1*row[5]+2*row[7]
+ ; b3: +2*row[5]-1*row[7]
+
+ pmaddwd m10, m8, [w1_plus_w5]
+ pmaddwd m11, m9, [w1_plus_w5]
+ pmaddwd m12, m8, [w5_plus_w7]
+ pmaddwd m13, m9, [w5_plus_w7]
+ psubd m2, m10 ; b1[0-3]
+ psubd m3, m11 ; b1[4-7]
+ paddd m0, m12 ; b0[0-3]
+ paddd m1, m13 ; b0[4-7]
+ pmaddwd m12, m8, [w7_plus_w3]
+ pmaddwd m13, m9, [w7_plus_w3]
+ pmaddwd m8, [w3_min_w1]
+ pmaddwd m9, [w3_min_w1]
+ paddd m4, m12 ; b2[0-3]
+ paddd m5, m13 ; b2[4-7]
+ paddd m6, m8 ; b3[0-3]
+ paddd m7, m9 ; b3[4-7]
+
+ ; row[0] = (a0 + b0) >> 15;
+ ; row[7] = (a0 - b0) >> 15;
+ ; row[1] = (a1 + b1) >> 15;
+ ; row[6] = (a1 - b1) >> 15;
+ ; row[2] = (a2 + b2) >> 15;
+ ; row[5] = (a2 - b2) >> 15;
+ ; row[3] = (a3 + b3) >> 15;
+ ; row[4] = (a3 - b3) >> 15;
+ mova m8, [COEFFS+ 0] ; a0[0-3]
+ mova m9, [COEFFS+16] ; a0[4-7]
+ SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
+ mova m0, [COEFFS+32] ; a1[0-3]
+ mova m1, [COEFFS+48] ; a1[4-7]
+ SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
+ mova m1, [COEFFS+64] ; a2[0-3]
+ mova m2, [COEFFS+80] ; a2[4-7]
+ SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
+ mova m2, [COEFFS+96] ; a3[0-3]
+ mova m3, [COEFFS+112] ; a3[4-7]
+ SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
+%endmacro
+
+; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t stride,
+; int16_t *block, const int16_t *qmat);
+
+; %1 = row bias macro ("" to use the pd_round_* rounding constant)
+; %2 = row shift
+; %3 = column bias macro ("" to use the pd_round_* rounding constant)
+; %4 = column shift
+; %5 = min pixel value
+; %6 = max pixel value
+; %7 = qmat (for prores)
+
+%macro IDCT_FN 4-7
+%if %0 == 4
+ ; No clamping means a pure idct
+%xdefine COEFFS r0
+%else
+ movsxd r1, r1d
+%xdefine COEFFS r2
+%endif
+
+ ; for (i = 0; i < 8; i++)
+ ; idctRowCondDC(block + i*8);
+ mova m10,[COEFFS+ 0] ; { row[0] }[0-7]
+ mova m8, [COEFFS+32] ; { row[2] }[0-7]
+ mova m13,[COEFFS+64] ; { row[4] }[0-7]
+ mova m12,[COEFFS+96] ; { row[6] }[0-7]
+
+%if %0 == 7
+ pmullw m10,[%7+ 0]
+ pmullw m8, [%7+32]
+ pmullw m13,[%7+64]
+ pmullw m12,[%7+96]
+
+ IDCT_1D %1, %2, %7
+%else
+ IDCT_1D %1, %2
+%endif
+
+ ; transpose for second part of IDCT
+ TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
+ mova [COEFFS+ 16], m0
+ mova [COEFFS+ 48], m2
+ mova [COEFFS+ 80], m11
+ mova [COEFFS+112], m10
+ SWAP 8, 10
+ SWAP 1, 8
+ SWAP 4, 13
+ SWAP 9, 12
+
+ ; for (i = 0; i < 8; i++)
+ ; idctSparseColAdd(dest + i, line_size, block + i);
+ IDCT_1D %3, %4
+
+ ; clip/store
+%if %0 == 4
+ ; No clamping means a pure idct
+ mova [r0+ 0], m8
+ mova [r0+ 16], m0
+ mova [r0+ 32], m1
+ mova [r0+ 48], m2
+ mova [r0+ 64], m4
+ mova [r0+ 80], m11
+ mova [r0+ 96], m9
+ mova [r0+112], m10
+%else
+%ifidn %5, 0
+ pxor m3, m3
+%else
+ mova m3, [%5]
+%endif
+ mova m5, [%6]
+ pmaxsw m8, m3
+ pmaxsw m0, m3
+ pmaxsw m1, m3
+ pmaxsw m2, m3
+ pmaxsw m4, m3
+ pmaxsw m11, m3
+ pmaxsw m9, m3
+ pmaxsw m10, m3
+ pminsw m8, m5
+ pminsw m0, m5
+ pminsw m1, m5
+ pminsw m2, m5
+ pminsw m4, m5
+ pminsw m11, m5
+ pminsw m9, m5
+ pminsw m10, m5
+
+ lea r2, [r1*3]
+ mova [r0 ], m8
+ mova [r0+r1 ], m0
+ mova [r0+r1*2], m1
+ mova [r0+r2 ], m2
+ lea r0, [r0+r1*4]
+ mova [r0 ], m4
+ mova [r0+r1 ], m11
+ mova [r0+r1*2], m9
+ mova [r0+r2 ], m10
+%endif
+%endmacro
+
+%endif
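For reference, here is a scalar model of one 1D pass, written from the commented pseudocode inside IDCT_1D and SUMSUB_SHPK. It uses the full-precision weights quoted in simple_idct10.asm's comments and 64-bit accumulators so the structure is easy to follow; the asm instead works on the >>2 constants in 32-bit lanes, and the pw_2 bias variant adds its offset to row[0] before the multiplies rather than to the accumulators. IDCT_FN then runs this over the eight rows, transposes, runs it over the eight columns, and either stores the result raw or clamps it to [%5, %6].

    #include <stdint.h>

    /* shift: 12/15 for the row pass, 19/16 for the column pass;
     * bias:  the rounding term, 1 << (shift - 1) in the pd_round_* case. */
    static void idct10_1d_model(int32_t row[8], int shift, int64_t bias)
    {
        static const int64_t W1 = 90901, W2 = 85627, W3 = 77062, W4 = 65535,
                             W5 = 51491, W6 = 35468, W7 = 18081;
        int64_t a0 = W4 * row[0] + bias, a1 = a0, a2 = a0, a3 = a0;
        int64_t b0, b1, b2, b3;

        a0 += W2 * row[2] + W4 * row[4] + W6 * row[6];
        a1 += W6 * row[2] - W4 * row[4] - W2 * row[6];
        a2 -= W6 * row[2] + W4 * row[4] - W2 * row[6];
        a3 -= W2 * row[2] - W4 * row[4] + W6 * row[6];

        b0 = W1 * row[1] + W3 * row[3] + W5 * row[5] + W7 * row[7];
        b1 = W3 * row[1] - W7 * row[3] - W1 * row[5] - W5 * row[7];
        b2 = W5 * row[1] - W1 * row[3] + W7 * row[5] + W3 * row[7];
        b3 = W7 * row[1] - W5 * row[3] + W3 * row[5] - W1 * row[7];

        row[0] = (int32_t)((a0 + b0) >> shift);
        row[7] = (int32_t)((a0 - b0) >> shift);
        row[1] = (int32_t)((a1 + b1) >> shift);
        row[6] = (int32_t)((a1 - b1) >> shift);
        row[2] = (int32_t)((a2 + b2) >> shift);
        row[5] = (int32_t)((a2 - b2) >> shift);
        row[3] = (int32_t)((a3 + b3) >> shift);
        row[4] = (int32_t)((a3 - b3) >> shift);
    }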
diff --git a/libavcodec/x86/snowdsp.c b/libavcodec/x86/snowdsp.c
new file mode 100644
index 0000000000..218e6864db
--- /dev/null
+++ b/libavcodec/x86/snowdsp.c
@@ -0,0 +1,908 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/snow.h"
+#include "libavcodec/snow_dwt.h"
+
+#if HAVE_INLINE_ASM
+
+static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
+ const int w2= (width+1)>>1;
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ IDWTELEM * const ref = b + w2 - 1;
+ IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
+ // (the first time erroneously), we allow the SSE2 code to run an extra pass.
+ // The savings in code and time are well worth having to store this value and
+ // calculate b[0] correctly afterwards.
+
+ i = 0;
+ __asm__ volatile(
+ "pcmpeqd %%xmm7, %%xmm7 \n\t"
+ "pcmpeqd %%xmm3, %%xmm3 \n\t"
+ "psllw $1, %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "psllw $13, %%xmm3 \n\t"
+ ::);
+ for(; i<w_l-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm2 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
+ "pmulhw %%xmm3, %%xmm2 \n\t"
+ "pmulhw %%xmm3, %%xmm6 \n\t"
+ "paddw (%0), %%xmm2 \n\t"
+ "paddw 16(%0), %%xmm6 \n\t"
+ "movdqa %%xmm2, (%0) \n\t"
+ "movdqa %%xmm6, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ }
+
+ { // Lift 1
+ IDWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
+ dst[i] = dst[i] - (b[i] + b[i + 1]);
+ }
+ for(; i<w_r-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "movdqa (%0), %%xmm0 \n\t"
+ "movdqa 16(%0), %%xmm4 \n\t"
+ "psubw %%xmm2, %%xmm0 \n\t"
+ "psubw %%xmm6, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ IDWTELEM * const ref = b+w2 - 1;
+ IDWTELEM b_0 = b[0];
+
+ i = 0;
+ __asm__ volatile(
+ "psllw $15, %%xmm7 \n\t"
+ "pcmpeqw %%xmm6, %%xmm6 \n\t"
+ "psrlw $13, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
+ ::);
+ for(; i<w_l-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu 16(%1), %%xmm4 \n\t"
+ "movdqu 2(%1), %%xmm1 \n\t"
+ "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
+ "paddw %%xmm6, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm5, %%xmm4 \n\t"
+ "psubw %%xmm7, %%xmm0 \n\t"
+ "psubw %%xmm7, %%xmm4 \n\t"
+ "psraw $1, %%xmm0 \n\t"
+ "psraw $1, %%xmm4 \n\t"
+ "movdqa (%0), %%xmm1 \n\t"
+ "movdqa 16(%0), %%xmm5 \n\t"
+ "paddw %%xmm1, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "psraw $2, %%xmm0 \n\t"
+ "psraw $2, %%xmm4 \n\t"
+ "paddw %%xmm1, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
+ }
+
+ { // Lift 3
+ IDWTELEM * const src = b+w2;
+
+ i = 0;
+ for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
+ temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+ }
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw (%1), %%xmm2 \n\t"
+ "paddw 16(%1), %%xmm6 \n\t"
+ "movdqu (%0), %%xmm0 \n\t"
+ "movdqu 16(%0), %%xmm4 \n\t"
+ "paddw %%xmm2, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "psraw $1, %%xmm2 \n\t"
+ "psraw $1, %%xmm6 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "paddw %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm2, (%2) \n\t"
+ "movdqa %%xmm6, 16(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x3E) != 0x3E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=62; i>=0; i-=64){
+ __asm__ volatile(
+ "movdqa (%1), %%xmm0 \n\t"
+ "movdqa 16(%1), %%xmm2 \n\t"
+ "movdqa 32(%1), %%xmm4 \n\t"
+ "movdqa 48(%1), %%xmm6 \n\t"
+ "movdqa (%1), %%xmm1 \n\t"
+ "movdqa 16(%1), %%xmm3 \n\t"
+ "movdqa 32(%1), %%xmm5 \n\t"
+ "movdqa 48(%1), %%xmm7 \n\t"
+ "punpcklwd (%2), %%xmm0 \n\t"
+ "punpcklwd 16(%2), %%xmm2 \n\t"
+ "punpcklwd 32(%2), %%xmm4 \n\t"
+ "punpcklwd 48(%2), %%xmm6 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm2, 32(%0) \n\t"
+ "movdqa %%xmm4, 64(%0) \n\t"
+ "movdqa %%xmm6, 96(%0) \n\t"
+ "punpckhwd (%2), %%xmm1 \n\t"
+ "punpckhwd 16(%2), %%xmm3 \n\t"
+ "punpckhwd 32(%2), %%xmm5 \n\t"
+ "punpckhwd 48(%2), %%xmm7 \n\t"
+ "movdqa %%xmm1, 16(%0) \n\t"
+ "movdqa %%xmm3, 48(%0) \n\t"
+ "movdqa %%xmm5, 80(%0) \n\t"
+ "movdqa %%xmm7, 112(%0) \n\t"
+ :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
+
+static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
+ const int w2= (width+1)>>1;
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ IDWTELEM * const ref = b + w2 - 1;
+
+ i = 1;
+ b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm3, %%mm3 \n\t"
+ "psllw $1, %%mm3 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "psllw $13, %%mm3 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%1), %%mm6 \n\t"
+ "paddw 2(%1), %%mm2 \n\t"
+ "paddw 10(%1), %%mm6 \n\t"
+ "paddw %%mm7, %%mm2 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
+ "pmulhw %%mm3, %%mm2 \n\t"
+ "pmulhw %%mm3, %%mm6 \n\t"
+ "paddw (%0), %%mm2 \n\t"
+ "paddw 8(%0), %%mm6 \n\t"
+ "movq %%mm2, (%0) \n\t"
+ "movq %%mm6, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ }
+
+ { // Lift 1
+ IDWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%1), %%mm6 \n\t"
+ "paddw 2(%1), %%mm2 \n\t"
+ "paddw 10(%1), %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm6, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ IDWTELEM * const ref = b+w2 - 1;
+
+ i = 1;
+ b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
+ __asm__ volatile(
+ "psllw $15, %%mm7 \n\t"
+ "pcmpeqw %%mm6, %%mm6 \n\t"
+ "psrlw $13, %%mm6 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm4 \n\t"
+ "movq 2(%1), %%mm1 \n\t"
+ "movq 10(%1), %%mm5 \n\t"
+ "paddw %%mm6, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm5, %%mm4 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psubw %%mm7, %%mm4 \n\t"
+ "psraw $1, %%mm0 \n\t"
+ "psraw $1, %%mm4 \n\t"
+ "movq (%0), %%mm1 \n\t"
+ "movq 8(%0), %%mm5 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm5, %%mm4 \n\t"
+ "psraw $2, %%mm0 \n\t"
+ "psraw $2, %%mm4 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm5, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ }
+
+ { // Lift 3
+ IDWTELEM * const src = b+w2;
+ i = 0;
+
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movq 2(%1), %%mm2 \n\t"
+ "movq 10(%1), %%mm6 \n\t"
+ "paddw (%1), %%mm2 \n\t"
+ "paddw 8(%1), %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "psraw $1, %%mm2 \n\t"
+ "psraw $1, %%mm6 \n\t"
+ "paddw %%mm0, %%mm2 \n\t"
+ "paddw %%mm4, %%mm6 \n\t"
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm6, 8(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x1E) != 0x1E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=30; i>=0; i-=32){
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm6 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm3 \n\t"
+ "movq 16(%1), %%mm5 \n\t"
+ "movq 24(%1), %%mm7 \n\t"
+ "punpcklwd (%2), %%mm0 \n\t"
+ "punpcklwd 8(%2), %%mm2 \n\t"
+ "punpcklwd 16(%2), %%mm4 \n\t"
+ "punpcklwd 24(%2), %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, 16(%0) \n\t"
+ "movq %%mm4, 32(%0) \n\t"
+ "movq %%mm6, 48(%0) \n\t"
+ "punpckhwd (%2), %%mm1 \n\t"
+ "punpckhwd 8(%2), %%mm3 \n\t"
+ "punpckhwd 16(%2), %%mm5 \n\t"
+ "punpckhwd 24(%2), %%mm7 \n\t"
+ "movq %%mm1, 8(%0) \n\t"
+ "movq %%mm3, 24(%0) \n\t"
+ "movq %%mm5, 40(%0) \n\t"
+ "movq %%mm7, 56(%0) \n\t"
+ :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
+
+#if HAVE_7REGS
+#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
+ ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
+ ""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\
+ ""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\
+ ""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
+ snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
+ snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "psubw %%"s0", %%"t0" \n\t"\
+ "psubw %%"s1", %%"t1" \n\t"\
+ "psubw %%"s2", %%"t2" \n\t"\
+ "psubw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
+ "movdqa %%"s0", ("w",%%"FF_REG_d") \n\t"\
+ "movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\
+ "movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\
+ "movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t"
+
+#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
+ "psraw $"n", %%"t0" \n\t"\
+ "psraw $"n", %%"t1" \n\t"\
+ "psraw $"n", %%"t2" \n\t"\
+ "psraw $"n", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "paddw %%"s0", %%"t0" \n\t"\
+ "paddw %%"s1", %%"t1" \n\t"\
+ "paddw %%"s2", %%"t2" \n\t"\
+ "paddw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "pmulhw %%"s0", %%"t0" \n\t"\
+ "pmulhw %%"s1", %%"t1" \n\t"\
+ "pmulhw %%"s2", %%"t2" \n\t"\
+ "pmulhw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "movdqa %%"s0", %%"t0" \n\t"\
+ "movdqa %%"s1", %%"t1" \n\t"\
+ "movdqa %%"s2", %%"t2" \n\t"\
+ "movdqa %%"s3", %%"t3" \n\t"
+
+static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+ x86_reg i = width;
+
+ while(i & 0x1F)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+ i+=i;
+
+ __asm__ volatile (
+ "jmp 2f \n\t"
+ "1: \n\t"
+ snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
+
+
+ "pcmpeqw %%xmm0, %%xmm0 \n\t"
+ "pcmpeqw %%xmm2, %%xmm2 \n\t"
+ "paddw %%xmm2, %%xmm2 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "psllw $13, %%xmm2 \n\t"
+ snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
+
+ "pcmpeqw %%xmm7, %%xmm7 \n\t"
+ "pcmpeqw %%xmm5, %%xmm5 \n\t"
+ "psllw $15, %%xmm7 \n\t"
+ "psrlw $13, %%xmm5 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+ "movq (%2,%%"FF_REG_d"), %%xmm1 \n\t"
+ "movq 8(%2,%%"FF_REG_d"), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm3, %%xmm2 \n\t"
+ "movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t"
+ "movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm4 \n\t"
+ "pavgw %%xmm3, %%xmm6 \n\t"
+ snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+
+ snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
+
+ "2: \n\t"
+ "sub $64, %%"FF_REG_d" \n\t"
+ "jge 1b \n\t"
+ :"+d"(i)
+ :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+
+#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
+ ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
+ ""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\
+ ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
+ ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
+ snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
+ snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
+ "movq %%"s0", ("w",%%"FF_REG_d") \n\t"\
+ "movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\
+ "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
+ "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"
+
+#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "movq %%"s0", %%"t0" \n\t"\
+ "movq %%"s1", %%"t1" \n\t"\
+ "movq %%"s2", %%"t2" \n\t"\
+ "movq %%"s3", %%"t3" \n\t"
+
+
+static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+ x86_reg i = width;
+ while(i & 15)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+ i+=i;
+ __asm__ volatile(
+ "jmp 2f \n\t"
+ "1: \n\t"
+
+ snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
+ "pcmpeqw %%mm0, %%mm0 \n\t"
+ "pcmpeqw %%mm2, %%mm2 \n\t"
+ "paddw %%mm2, %%mm2 \n\t"
+ "paddw %%mm0, %%mm2 \n\t"
+ "psllw $13, %%mm2 \n\t"
+ snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm5, %%mm5 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "psrlw $13, %%mm5 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+ "movq (%2,%%"FF_REG_d"), %%mm1 \n\t"
+ "movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm3, %%mm2 \n\t"
+ "movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
+ "movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm4 \n\t"
+ "pavgw %%mm3, %%mm6 \n\t"
+ snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+
+ snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
+
+ "2: \n\t"
+ "sub $32, %%"FF_REG_d" \n\t"
+ "jge 1b \n\t"
+ :"+d"(i)
+ :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+#endif //HAVE_7REGS
+
+#if HAVE_6REGS
+#define snow_inner_add_yblock_sse2_header \
+ IDWTELEM * * dst_array = sb->line + src_y;\
+ x86_reg tmp;\
+ __asm__ volatile(\
+ "mov %7, %%"FF_REG_c" \n\t"\
+ "mov %6, %2 \n\t"\
+ "mov %4, %%"FF_REG_S" \n\t"\
+ "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
+ "pcmpeqd %%xmm3, %%xmm3 \n\t"\
+ "psllw $15, %%xmm3 \n\t"\
+ "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
+ "1: \n\t"\
+ "mov %1, %%"FF_REG_D" \n\t"\
+ "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
+ "add %3, %%"FF_REG_D" \n\t"
+
+#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
+ "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
+ "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
+ "movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+ "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
+ "movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ "pmullw %%xmm0, %%"out_reg1" \n\t"\
+ "pmullw %%xmm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
+ "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
+ "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
+ "movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+ "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
+ "movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ "pmullw %%xmm0, %%"out_reg1" \n\t"\
+ "pmullw %%xmm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
+ snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
+ "paddusw %%xmm2, %%xmm1 \n\t"\
+ "paddusw %%xmm6, %%xmm5 \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
+ snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
+ "paddusw %%xmm2, %%xmm1 \n\t"\
+ "paddusw %%xmm6, %%xmm5 \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common1\
+ "add $32, %%"FF_REG_S" \n\t"\
+ "add %%"FF_REG_c", %0 \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common2\
+ "jnz 1b \n\t"\
+ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+ :\
+ "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+ XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
+ "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
+
+#define snow_inner_add_yblock_sse2_end_8\
+ "sal $1, %%"FF_REG_c" \n\t"\
+ "add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
+ snow_inner_add_yblock_sse2_end_common1\
+ "sar $1, %%"FF_REG_c" \n\t"\
+ "sub $2, %2 \n\t"\
+ snow_inner_add_yblock_sse2_end_common2
+
+#define snow_inner_add_yblock_sse2_end_16\
+ "add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
+ snow_inner_add_yblock_sse2_end_common1\
+ "dec %2 \n\t"\
+ snow_inner_add_yblock_sse2_end_common2
+
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_8("2", "8")
+snow_inner_add_yblock_sse2_accum_8("1", "128")
+snow_inner_add_yblock_sse2_accum_8("0", "136")
+
+ "mov %0, %%"FF_REG_d" \n\t"
+ "movdqa (%%"FF_REG_D"), %%xmm0 \n\t"
+ "movdqa %%xmm1, %%xmm2 \n\t"
+
+ "punpckhwd %%xmm7, %%xmm1 \n\t"
+ "punpcklwd %%xmm7, %%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm0 \n\t"
+ "movdqa 16(%%"FF_REG_D"), %%xmm2\n\t"
+ "paddd %%xmm1, %%xmm2 \n\t"
+ "paddd %%xmm3, %%xmm0 \n\t"
+ "paddd %%xmm3, %%xmm2 \n\t"
+
+ "mov %1, %%"FF_REG_D" \n\t"
+ "mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
+ "add %3, %%"FF_REG_D" \n\t"
+
+ "movdqa (%%"FF_REG_D"), %%xmm4 \n\t"
+ "movdqa %%xmm5, %%xmm6 \n\t"
+ "punpckhwd %%xmm7, %%xmm5 \n\t"
+ "punpcklwd %%xmm7, %%xmm6 \n\t"
+ "paddd %%xmm6, %%xmm4 \n\t"
+ "movdqa 16(%%"FF_REG_D"), %%xmm6\n\t"
+ "paddd %%xmm5, %%xmm6 \n\t"
+ "paddd %%xmm3, %%xmm4 \n\t"
+ "paddd %%xmm3, %%xmm6 \n\t"
+
+ "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
+ "packssdw %%xmm2, %%xmm0 \n\t"
+ "packuswb %%xmm7, %%xmm0 \n\t"
+ "movq %%xmm0, (%%"FF_REG_d") \n\t"
+
+ "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
+ "packssdw %%xmm6, %%xmm4 \n\t"
+ "packuswb %%xmm7, %%xmm4 \n\t"
+ "movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
+snow_inner_add_yblock_sse2_end_8
+}
+
+static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_16("2", "16")
+snow_inner_add_yblock_sse2_accum_16("1", "512")
+snow_inner_add_yblock_sse2_accum_16("0", "528")
+
+ "mov %0, %%"FF_REG_d" \n\t"
+ "psrlw $4, %%xmm1 \n\t"
+ "psrlw $4, %%xmm5 \n\t"
+ "paddw (%%"FF_REG_D"), %%xmm1 \n\t"
+ "paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
+ "paddw %%xmm3, %%xmm1 \n\t"
+ "paddw %%xmm3, %%xmm5 \n\t"
+ "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
+ "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
+ "packuswb %%xmm5, %%xmm1 \n\t"
+
+ "movdqu %%xmm1, (%%"FF_REG_d") \n\t"
+
+snow_inner_add_yblock_sse2_end_16
+}
+
+#define snow_inner_add_yblock_mmx_header \
+ IDWTELEM * * dst_array = sb->line + src_y;\
+ x86_reg tmp;\
+ __asm__ volatile(\
+ "mov %7, %%"FF_REG_c" \n\t"\
+ "mov %6, %2 \n\t"\
+ "mov %4, %%"FF_REG_S" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t" /* 0 */\
+ "pcmpeqd %%mm3, %%mm3 \n\t"\
+ "psllw $15, %%mm3 \n\t"\
+ "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
+ "1: \n\t"\
+ "mov %1, %%"FF_REG_D" \n\t"\
+ "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
+ "add %3, %%"FF_REG_D" \n\t"
+
+#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
+ "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
+ "movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\
+ "movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\
+ "punpcklbw %%mm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%mm7, %%"out_reg2" \n\t"\
+ "movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\
+ "movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "pmullw %%mm0, %%"out_reg1" \n\t"\
+ "pmullw %%mm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
+ snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
+ "paddusw %%mm2, %%mm1 \n\t"\
+ "paddusw %%mm6, %%mm5 \n\t"
+
+#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
+ "mov %0, %%"FF_REG_d" \n\t"\
+ "psrlw $4, %%mm1 \n\t"\
+ "psrlw $4, %%mm5 \n\t"\
+ "paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\
+ "paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psraw $4, %%mm1 \n\t"\
+ "psraw $4, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm1 \n\t"\
+ "movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"
+
+#define snow_inner_add_yblock_mmx_end(s_step)\
+ "add $"s_step", %%"FF_REG_S" \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\
+ "add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\
+ "add %%"FF_REG_c", %0 \n\t"\
+ "dec %2 \n\t"\
+ "jnz 1b \n\t"\
+ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+ :\
+ "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+ "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
+
+static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "8", "0")
+snow_inner_add_yblock_mmx_accum("1", "128", "0")
+snow_inner_add_yblock_mmx_accum("0", "136", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+snow_inner_add_yblock_mmx_end("16")
+}
+
+static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "16", "0")
+snow_inner_add_yblock_mmx_accum("1", "512", "0")
+snow_inner_add_yblock_mmx_accum("0", "528", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
+snow_inner_add_yblock_mmx_accum("2", "24", "8")
+snow_inner_add_yblock_mmx_accum("1", "520", "8")
+snow_inner_add_yblock_mmx_accum("0", "536", "8")
+snow_inner_add_yblock_mmx_mix("16", "8")
+snow_inner_add_yblock_mmx_end("32")
+}
+
+static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16) {
+ if (!(b_h & 1))
+ inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ } else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16)
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+#endif /* HAVE_6REGS */
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_dwt_init_x86(SnowDWTContext *c)
+{
+#if HAVE_INLINE_ASM
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+#if HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+#endif
+#if HAVE_6REGS
+ c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+#endif
+ }
+ else{
+ if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+#if HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+#endif
+ }
+#if HAVE_6REGS
+ c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+#endif
+ }
+ }
+#endif /* HAVE_INLINE_ASM */
+}
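Both the horizontal and the vertical SIMD routines above implement the same four integer lifting steps of the snow 9/7 wavelet; the scalar lead-in loop of ff_snow_vertical_compose97i_sse2 spells them out. A plain C version of the vertical case, copied from that loop, looks like the sketch below (IDWTELEM and the W_* constants come from the snow headers included at the top of the file).

    #include "libavcodec/snow_dwt.h"

    static void snow_vertical_compose97i_scalar(IDWTELEM *b0, IDWTELEM *b1,
                                                IDWTELEM *b2, IDWTELEM *b3,
                                                IDWTELEM *b4, IDWTELEM *b5,
                                                int width)
    {
        int i;

        for (i = 0; i < width; i++) {
            b4[i] -= (W_DM * (b3[i] + b5[i]) + W_DO) >> W_DS;
            b3[i] -= (W_CM * (b2[i] + b4[i]) + W_CO) >> W_CS;
            b2[i] += (W_BM * (b1[i] + b3[i]) + 4 * b2[i] + W_BO) >> W_BS;
            b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
        }
    }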
diff --git a/libavcodec/x86/svq1enc.asm b/libavcodec/x86/svq1enc.asm
new file mode 100644
index 0000000000..a87632836d
--- /dev/null
+++ b/libavcodec/x86/svq1enc.asm
@@ -0,0 +1,61 @@
+;******************************************************************************
+;* SIMD-optimized SVQ1 encoder functions
+;* Copyright (c) 2007 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SSD_INT8_VS_INT16 0
+cglobal ssd_int8_vs_int16, 3, 3, 3, pix1, pix2, size
+ pxor m0, m0
+.loop:
+ sub sizeq, 8
+ movq m1, [pix1q + sizeq]
+ mova m2, [pix2q + sizeq*2]
+%if mmsize == 8
+ movq m3, [pix2q + sizeq*2 + mmsize]
+ punpckhbw m4, m1
+ punpcklbw m1, m1
+ psraw m4, 8
+ psraw m1, 8
+ psubw m3, m4
+ psubw m2, m1
+ pmaddwd m3, m3
+ pmaddwd m2, m2
+ paddd m0, m3
+ paddd m0, m2
+%else
+ punpcklbw m1, m1
+ psraw m1, 8
+ psubw m2, m1
+ pmaddwd m2, m2
+ paddd m0, m2
+%endif
+ jg .loop
+ HADDD m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmx
+SSD_INT8_VS_INT16
+INIT_XMM sse2
+SSD_INT8_VS_INT16
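For reference, the MMX/SSE2 loops above compute a sum of squared differences between an int8 block and an int16 block, walking both buffers backwards eight samples at a time. A scalar equivalent, matching the C fallback the encoder otherwise uses, is simply:

    #include <stdint.h>

    static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                                   intptr_t size)
    {
        int score = 0;

        /* psubw + pmaddwd in the asm: difference, square, accumulate. */
        while (size--)
            score += (pix1[size] - pix2[size]) * (pix1[size] - pix2[size]);
        return score;
    }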
diff --git a/libavcodec/x86/svq1enc.c b/libavcodec/x86/svq1enc.c
deleted file mode 100644
index 02b0a84b8c..0000000000
--- a/libavcodec/x86/svq1enc.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/svq1enc.h"
-
-#if HAVE_INLINE_ASM
-
-static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
- int size)
-{
- int sum;
- x86_reg i = size;
-
- __asm__ volatile (
- "pxor %%mm4, %%mm4 \n"
- "1: \n"
- "sub $8, %0 \n"
- "movq (%2, %0), %%mm2 \n"
- "movq (%3, %0, 2), %%mm0 \n"
- "movq 8(%3, %0, 2), %%mm1 \n"
- "punpckhbw %%mm2, %%mm3 \n"
- "punpcklbw %%mm2, %%mm2 \n"
- "psraw $8, %%mm3 \n"
- "psraw $8, %%mm2 \n"
- "psubw %%mm3, %%mm1 \n"
- "psubw %%mm2, %%mm0 \n"
- "pmaddwd %%mm1, %%mm1 \n"
- "pmaddwd %%mm0, %%mm0 \n"
- "paddd %%mm1, %%mm4 \n"
- "paddd %%mm0, %%mm4 \n"
- "jg 1b \n"
- "movq %%mm4, %%mm3 \n"
- "psrlq $32, %%mm3 \n"
- "paddd %%mm3, %%mm4 \n"
- "movd %%mm4, %1 \n"
- : "+r" (i), "=r" (sum)
- : "r" (pix1), "r" (pix2));
-
- return sum;
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
-{
-#if HAVE_INLINE_ASM
- int cpu_flags = av_get_cpu_flags();
-
- if (INLINE_MMX(cpu_flags)) {
- c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
- }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/svq1enc_init.c b/libavcodec/x86/svq1enc_init.c
new file mode 100644
index 0000000000..40b4b0e183
--- /dev/null
+++ b/libavcodec/x86/svq1enc_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2007 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/svq1enc.h"
+
+int ff_ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
+ intptr_t size);
+int ff_ssd_int8_vs_int16_sse2(const int8_t *pix1, const int16_t *pix2,
+ intptr_t size);
+
+av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_mmx;
+ }
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_sse2;
+ }
+}
diff --git a/libavcodec/x86/synth_filter.asm b/libavcodec/x86/synth_filter.asm
new file mode 100644
index 0000000000..bc1a48f409
--- /dev/null
+++ b/libavcodec/x86/synth_filter.asm
@@ -0,0 +1,246 @@
+;******************************************************************************
+;* SSE-optimized functions for the DCA decoder
+;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SETZERO 1
+%if cpuflag(sse2) && notcpuflag(avx)
+ pxor %1, %1
+%else
+ xorps %1, %1, %1
+%endif
+%endmacro
+
+%macro SHUF 3
+%if cpuflag(avx)
+ mova %3, [%2 - 16]
+ vperm2f128 %1, %3, %3, 1
+ vshufps %1, %1, %1, q0123
+%elif cpuflag(sse2)
+ pshufd %1, [%2], q0123
+%else
+ mova %1, [%2]
+ shufps %1, %1, q0123
+%endif
+%endmacro
+
+%macro INNER_LOOP 1
+ ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
+ ;~ a += window[i + j] * (-synth_buf[15 - i + j])
+ ;~ b += window[i + j + 16] * (synth_buf[i + j])
+ SHUF m5, ptr2 + j + (15 - 3) * 4, m6
+ mova m6, [ptr1 + j]
+%if ARCH_X86_64
+ SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
+ mova m12, [ptr1 + j + mmsize]
+%endif
+%if cpuflag(fma3)
+ fmaddps m2, m6, [win + %1 + j + 16 * 4], m2
+ fnmaddps m1, m5, [win + %1 + j], m1
+%if ARCH_X86_64
+ fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
+ fnmaddps m7, m11, [win + %1 + j + mmsize], m7
+%endif
+%else ; non-FMA
+ mulps m6, m6, [win + %1 + j + 16 * 4]
+ mulps m5, m5, [win + %1 + j]
+%if ARCH_X86_64
+ mulps m12, m12, [win + %1 + j + mmsize + 16 * 4]
+ mulps m11, m11, [win + %1 + j + mmsize]
+%endif
+ addps m2, m2, m6
+ subps m1, m1, m5
+%if ARCH_X86_64
+ addps m8, m8, m12
+ subps m7, m7, m11
+%endif
+%endif ; cpuflag(fma3)
+ ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
+ ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
+ SHUF m6, ptr2 + j + (31 - 3) * 4, m5
+ mova m5, [ptr1 + j + 16 * 4]
+%if ARCH_X86_64
+ SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
+ mova m11, [ptr1 + j + mmsize + 16 * 4]
+%endif
+%if cpuflag(fma3)
+ fmaddps m3, m5, [win + %1 + j + 32 * 4], m3
+ fmaddps m4, m6, [win + %1 + j + 48 * 4], m4
+%if ARCH_X86_64
+ fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
+ fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
+%endif
+%else ; non-FMA
+ mulps m5, m5, [win + %1 + j + 32 * 4]
+ mulps m6, m6, [win + %1 + j + 48 * 4]
+%if ARCH_X86_64
+ mulps m11, m11, [win + %1 + j + mmsize + 32 * 4]
+ mulps m12, m12, [win + %1 + j + mmsize + 48 * 4]
+%endif
+ addps m3, m3, m5
+ addps m4, m4, m6
+%if ARCH_X86_64
+ addps m9, m9, m11
+ addps m10, m10, m12
+%endif
+%endif ; cpuflag(fma3)
+ sub j, 64 * 4
+%endmacro
+
+; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
+; const float window[512], float out[32],
+; intptr_t offset, float scale)
+%macro SYNTH_FILTER 0
+cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
+ synth_buf, synth_buf2, window, out, off, scale
+%define scale m0
+%if ARCH_X86_32 || WIN64
+%if cpuflag(sse2) && notcpuflag(avx)
+ movd scale, scalem
+ SPLATD m0
+%else
+ VBROADCASTSS m0, scalem
+%endif
+; Make sure offset is in a register and not on the stack
+%define OFFQ r4q
+%else
+ SPLATD xmm0
+%if cpuflag(avx)
+ vinsertf128 m0, m0, xmm0, 1
+%endif
+%define OFFQ offq
+%endif
+ ; prepare inner counter limit 1
+ mov r5q, 480
+ sub r5q, offmp
+ and r5q, -64
+ shl r5q, 2
+%if ARCH_X86_32 || notcpuflag(avx)
+ mov OFFQ, r5q
+%define i r5q
+ mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
+%else
+%define i 0
+%define OFFQ r5q
+%endif
+
+%define buf2 synth_buf2q
+%if ARCH_X86_32
+ mov buf2, synth_buf2mp
+%endif
+.mainloop:
+ ; m1 = a m2 = b m3 = c m4 = d
+ SETZERO m3
+ SETZERO m4
+ mova m1, [buf2 + i]
+ mova m2, [buf2 + i + 16 * 4]
+%if ARCH_X86_32
+%define ptr1 r0q
+%define ptr2 r1q
+%define win r2q
+%define j r3q
+ mov win, windowm
+ mov ptr1, synth_bufm
+%if ARCH_X86_32 || notcpuflag(avx)
+ add win, i
+ add ptr1, i
+%endif
+%else ; ARCH_X86_64
+%define ptr1 r6q
+%define ptr2 r7q ; must be loaded
+%define win r8q
+%define j r9q
+ SETZERO m9
+ SETZERO m10
+ mova m7, [buf2 + i + mmsize]
+ mova m8, [buf2 + i + mmsize + 16 * 4]
+ lea win, [windowq + i]
+ lea ptr1, [synth_bufq + i]
+%endif
+ mov ptr2, synth_bufmp
+ ; prepare the inner loop counter
+ mov j, OFFQ
+%if ARCH_X86_32 || notcpuflag(avx)
+ sub ptr2, i
+%endif
+.loop1:
+ INNER_LOOP 0
+ jge .loop1
+
+ mov j, 448 * 4
+ sub j, OFFQ
+ jz .end
+ sub ptr1, j
+ sub ptr2, j
+ add win, OFFQ ; now at j-64, so define OFFSET
+ sub j, 64 * 4
+.loop2:
+ INNER_LOOP 64 * 4
+ jge .loop2
+
+.end:
+%if ARCH_X86_32
+ mov buf2, synth_buf2m ; needed for next iteration anyway
+ mov outq, outmp ; j, which will be set again during it
+%endif
+ ;~ out[i] = a * scale;
+ ;~ out[i + 16] = b * scale;
+ mulps m1, m1, scale
+ mulps m2, m2, scale
+%if ARCH_X86_64
+ mulps m7, m7, scale
+ mulps m8, m8, scale
+%endif
+ ;~ synth_buf2[i] = c;
+ ;~ synth_buf2[i + 16] = d;
+ mova [buf2 + i + 0 * 4], m3
+ mova [buf2 + i + 16 * 4], m4
+%if ARCH_X86_64
+ mova [buf2 + i + 0 * 4 + mmsize], m9
+ mova [buf2 + i + 16 * 4 + mmsize], m10
+%endif
+ ;~ out[i] = a;
+ ;~ out[i + 16] = b;
+ mova [outq + i + 0 * 4], m1
+ mova [outq + i + 16 * 4], m2
+%if ARCH_X86_64
+ mova [outq + i + 0 * 4 + mmsize], m7
+ mova [outq + i + 16 * 4 + mmsize], m8
+%endif
+%if ARCH_X86_32 || notcpuflag(avx)
+ sub i, (ARCH_X86_64 + 1) * mmsize
+ jge .mainloop
+%endif
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+SYNTH_FILTER
+%endif
+INIT_XMM sse2
+SYNTH_FILTER
+INIT_YMM avx
+SYNTH_FILTER
+INIT_YMM fma3
+SYNTH_FILTER
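A scalar sketch of what ff_synth_filter_inner_<opt> computes, reconstructed from the ;~ comments in INNER_LOOP and from the prototype used below in synth_filter_init.c. The second j loop corresponds to .loop2 (the wrap-around pass); the exact -512 index adjustment there is an assumption taken from the usual C synthesis filter rather than from this diff.

    #include <stdint.h>

    static void synth_filter_inner_c(float *synth_buf, float synth_buf2[32],
                                     const float window[512], float out[32],
                                     intptr_t offset, float scale)
    {
        int i, j;

        for (i = 0; i < 16; i++) {
            float a = synth_buf2[i];
            float b = synth_buf2[i + 16];
            float c = 0, d = 0;

            for (j = 0; j < 512 - offset; j += 64) {
                a += window[i + j]      * -synth_buf[15 - i + j];
                b += window[i + j + 16] *  synth_buf[i + j];
                c += window[i + j + 32] *  synth_buf[16 + i + j];
                d += window[i + j + 48] *  synth_buf[31 - i + j];
            }
            for (; j < 512; j += 64) {   /* wrap-around pass (.loop2) */
                a += window[i + j]      * -synth_buf[15 - i + j - 512];
                b += window[i + j + 16] *  synth_buf[i + j - 512];
                c += window[i + j + 32] *  synth_buf[16 + i + j - 512];
                d += window[i + j + 48] *  synth_buf[31 - i + j - 512];
            }
            out[i]             = a * scale;
            out[i + 16]        = b * scale;
            synth_buf2[i]      = c;
            synth_buf2[i + 16] = d;
        }
    }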
diff --git a/libavcodec/x86/synth_filter_init.c b/libavcodec/x86/synth_filter_init.c
new file mode 100644
index 0000000000..9ef00cdb0a
--- /dev/null
+++ b/libavcodec/x86/synth_filter_init.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/synth_filter.h"
+
+#define SYNTH_FILTER_FUNC(opt) \
+void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
+ const float window[512], \
+ float out[32], intptr_t offset, float scale); \
+static void synth_filter_##opt(FFTContext *imdct, \
+ float *synth_buf_ptr, int *synth_buf_offset, \
+ float synth_buf2[32], const float window[512], \
+ float out[32], const float in[32], float scale) \
+{ \
+ float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
+ \
+ imdct->imdct_half(imdct, synth_buf, in); \
+ \
+ ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
+ out, *synth_buf_offset, scale); \
+ \
+ *synth_buf_offset = (*synth_buf_offset - 32) & 511; \
+} \
+
+#if HAVE_YASM
+#if ARCH_X86_32
+SYNTH_FILTER_FUNC(sse)
+#endif
+SYNTH_FILTER_FUNC(sse2)
+SYNTH_FILTER_FUNC(avx)
+SYNTH_FILTER_FUNC(fma3)
+#endif /* HAVE_YASM */
+
+av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+ if (EXTERNAL_SSE(cpu_flags)) {
+ s->synth_filter_float = synth_filter_sse;
+ }
+#endif
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ s->synth_filter_float = synth_filter_sse2;
+ }
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ s->synth_filter_float = synth_filter_avx;
+ }
+ if (EXTERNAL_FMA3_FAST(cpu_flags)) {
+ s->synth_filter_float = synth_filter_fma3;
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
new file mode 100644
index 0000000000..5f3ded3ea2
--- /dev/null
+++ b/libavcodec/x86/takdsp.asm
@@ -0,0 +1,116 @@
+;******************************************************************************
+;* TAK DSP SIMD optimizations
+;*
+;* Copyright (C) 2015 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_128: times 4 dd 128
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
+ shl lengthd, 2
+ add p1q, lengthq
+ add p2q, lengthq
+ neg lengthq
+.loop:
+ mova m0, [p1q+lengthq+mmsize*0]
+ mova m1, [p1q+lengthq+mmsize*1]
+ paddd m0, [p2q+lengthq+mmsize*0]
+ paddd m1, [p2q+lengthq+mmsize*1]
+ mova [p2q+lengthq+mmsize*0], m0
+ mova [p2q+lengthq+mmsize*1], m1
+ add lengthq, mmsize*2
+ jl .loop
+ REP_RET
+
+cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
+ shl lengthd, 2
+ add p1q, lengthq
+ add p2q, lengthq
+ neg lengthq
+
+.loop:
+ mova m0, [p2q+lengthq+mmsize*0]
+ mova m1, [p2q+lengthq+mmsize*1]
+ psubd m0, [p1q+lengthq+mmsize*0]
+ psubd m1, [p1q+lengthq+mmsize*1]
+ mova [p1q+lengthq+mmsize*0], m0
+ mova [p1q+lengthq+mmsize*1], m1
+ add lengthq, mmsize*2
+ jl .loop
+ REP_RET
+
+cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
+ shl lengthd, 2
+ add p1q, lengthq
+ add p2q, lengthq
+ neg lengthq
+
+.loop:
+ mova m0, [p1q+lengthq]
+ mova m1, [p2q+lengthq]
+ mova m3, [p1q+lengthq+mmsize]
+ mova m4, [p2q+lengthq+mmsize]
+ mova m2, m1
+ mova m5, m4
+ psrad m2, 1
+ psrad m5, 1
+ psubd m0, m2
+ psubd m3, m5
+ paddd m1, m0
+ paddd m4, m3
+ mova [p1q+lengthq], m0
+ mova [p2q+lengthq], m1
+ mova [p1q+lengthq+mmsize], m3
+ mova [p2q+lengthq+mmsize], m4
+ add lengthq, mmsize*2
+ jl .loop
+ REP_RET
+
+INIT_XMM sse4
+cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
+ shl lengthd, 2
+ add p1q, lengthq
+ add p2q, lengthq
+ neg lengthq
+
+ movd m2, dshiftm
+ movd m3, dfactorm
+ pshufd m3, m3, 0
+ mova m4, [pd_128]
+
+.loop:
+ mova m0, [p1q+lengthq]
+ mova m1, [p2q+lengthq]
+ psrad m1, m2
+ pmulld m1, m3
+ paddd m1, m4
+ psrad m1, 8
+ pslld m1, m2
+ psubd m1, m0
+ mova [p1q+lengthq], m1
+ add lengthq, mmsize
+ jl .loop
+ REP_RET
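
For readers who prefer not to decode the SIMD, here is a rough scalar sketch of what the four decorrelation kernels above compute, derived from the instruction sequences themselves (function names are illustrative; the C reference in libavcodec/takdsp.c remains authoritative, e.g. for overflow behaviour). The SSE2 kernels consume 8 samples per iteration and the SSE4 one 4, so any scalar tail is presumably handled by the caller.

    #include <stdint.h>

    /* tak_decorrelate_ls: paddd of p1/p2, result written back to p2 */
    static void tak_decorrelate_ls_c(int32_t *p1, int32_t *p2, int length)
    {
        for (int i = 0; i < length; i++)
            p2[i] += p1[i];
    }

    /* tak_decorrelate_sr: psubd, result written back to p1 */
    static void tak_decorrelate_sr_c(int32_t *p1, int32_t *p2, int length)
    {
        for (int i = 0; i < length; i++)
            p1[i] = p2[i] - p1[i];
    }

    /* tak_decorrelate_sm: psrad 1 + psubd, then paddd */
    static void tak_decorrelate_sm_c(int32_t *p1, int32_t *p2, int length)
    {
        for (int i = 0; i < length; i++) {
            p1[i] -= p2[i] >> 1;
            p2[i] += p1[i];
        }
    }

    /* tak_decorrelate_sf: psrad/pmulld/paddd 128/psrad 8/pslld, then psubd */
    static void tak_decorrelate_sf_c(int32_t *p1, int32_t *p2, int length,
                                     int dshift, int dfactor)
    {
        for (int i = 0; i < length; i++) {
            int32_t b = p2[i] >> dshift;
            b = ((b * dfactor + 128) >> 8) << dshift;
            p1[i] = b - p1[i];
        }
    }
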
diff --git a/libavcodec/x86/takdsp_init.c b/libavcodec/x86/takdsp_init.c
new file mode 100644
index 0000000000..555d0649c9
--- /dev/null
+++ b/libavcodec/x86/takdsp_init.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2015 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/takdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
+
+av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->decorrelate_ls = ff_tak_decorrelate_ls_sse2;
+ c->decorrelate_sr = ff_tak_decorrelate_sr_sse2;
+ c->decorrelate_sm = ff_tak_decorrelate_sm_sse2;
+ }
+
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
+ }
+#endif
+}
diff --git a/libavcodec/x86/ttadsp.asm b/libavcodec/x86/ttadsp.asm
new file mode 100644
index 0000000000..db12a32eca
--- /dev/null
+++ b/libavcodec/x86/ttadsp.asm
@@ -0,0 +1,119 @@
+;******************************************************************************
+;* TTA DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_n0113: dd ~0, ~1, ~1, ~3
+pd_1224: dd 1, 2, 2, 4
+
+SECTION .text
+
+%macro TTA_FILTER 2
+INIT_XMM %1
+cglobal tta_filter_process, 5,5,%2, qm, dx, dl, error, in, shift, round
+ mova m2, [qmq ]
+ mova m3, [qmq + 0x10]
+ mova m4, [dxq ]
+ mova m5, [dxq + 0x10]
+
+ movd m6, [errorq] ; if (filter->error < 0) {
+ SPLATD m6 ; for (int i = 0; i < 8; i++)
+ psignd m0, m4, m6 ; filter->qm[i] -= filter->dx[i];
+ psignd m1, m5, m6 ; } else if (filter->error > 0) {
+ paddd m2, m0 ; for (int i = 0; i < 8; i++)
+ paddd m3, m1 ; filter->qm[i] += filter->dx[i];
+ mova [qmq ], m2 ; }
+ mova [qmq + 0x10], m3 ;
+
+ mova m0, [dlq ]
+ mova m1, [dlq + 0x10]
+
+%if cpuflag(sse4)
+ pmulld m2, m0
+ pmulld m3, m1
+%else
+ pshufd m6, m0, 0xb1
+ pshufd m7, m2, 0xb1
+ pmuludq m6, m7
+ pshufd m6, m6, 0xd8
+ pmuludq m2, m0
+ pshufd m2, m2, 0xd8
+ punpckldq m2, m6
+
+ pshufd m6, m1, 0xb1
+ pshufd m7, m3, 0xb1
+ pmuludq m6, m7
+ pshufd m6, m6, 0xd8
+ pmuludq m3, m1
+ pshufd m3, m3, 0xd8
+ punpckldq m3, m6
+%endif
+ ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
+ paddd m2, m3 ; int sum = filter->round +
+ ; filter->dl[0] * filter->qm[0] +
+ pshufd m3, m2, 0xe ; filter->dl[1] * filter->qm[1] +
+ paddd m2, m3 ; filter->dl[2] * filter->qm[2] +
+ ; filter->dl[3] * filter->qm[3] +
+ movd m6, roundm ; filter->dl[4] * filter->qm[4] +
+ paddd m6, m2 ; filter->dl[5] * filter->qm[5] +
+ pshufd m2, m2, 0x1 ; filter->dl[6] * filter->qm[6] +
+ paddd m6, m2 ; filter->dl[7] * filter->qm[7];
+
+ palignr m5, m4, 4 ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
+ ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
+
+ palignr m2, m1, m0, 4 ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
+ ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
+
+ psrad m4, m1, 30 ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
+ por m4, [pd_1224 ] ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
+ pand m4, [pd_n0113] ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
+ ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
+
+ mova [dlq ], m2
+ mova [dxq ], m5
+ mova [dxq + 0x10], m4
+ movd m0, [inq] ; filter->error = *in;
+ movd [errorq], m0 ;
+
+ movd m2, shiftm ; *in += (sum >> filter->shift);
+ psrad m6, m2 ;
+ paddd m0, m6 ;
+ movd [inq], m0 ;
+
+ psrldq m1, 4 ;
+ pslldq m0, 12 ; filter->dl[4] = -filter->dl[5];
+ pshufd m0, m0, 0xf0 ; filter->dl[5] = -filter->dl[6];
+ psubd m0, m1 ; filter->dl[6] = *in - filter->dl[7];
+ psrldq m1, m0, 4 ; filter->dl[7] = *in;
+ pshufd m1, m1, 0xf4 ; filter->dl[5] += filter->dl[6];
+ paddd m0, m1 ; filter->dl[4] += filter->dl[5];
+ psrldq m1, 4 ;
+ paddd m0, m1 ;
+ mova [dlq + 0x10], m0 ;
+ RET
+%endmacro
+
+TTA_FILTER ssse3, 8
+TTA_FILTER sse4, 7
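
The C statements interleaved as comments above correspond to the following scalar routine; a minimal sketch matching the prototype declared in ttadsp_init.c, using 32-bit arithmetic throughout as the SIMD code does:

    #include <stdint.h>

    static void tta_filter_process_c(int32_t *qm, int32_t *dx, int32_t *dl,
                                     int32_t *error, int32_t *in,
                                     int32_t shift, int32_t round)
    {
        int32_t sum = round;

        /* adapt the quantized coefficients by the sign of the previous
         * error (the psignd + paddd block above) */
        if (*error < 0) {
            for (int i = 0; i < 8; i++)
                qm[i] -= dx[i];
        } else if (*error > 0) {
            for (int i = 0; i < 8; i++)
                qm[i] += dx[i];
        }

        /* 8-tap prediction (pmulld/pmuludq plus the shuffle/add reduction) */
        for (int i = 0; i < 8; i++)
            sum += qm[i] * dl[i];

        /* shift the adaptation and history delay lines */
        dx[0] = dx[1]; dx[1] = dx[2]; dx[2] = dx[3]; dx[3] = dx[4];
        dx[4] = ((dl[4] >> 30) | 1);
        dx[5] = ((dl[5] >> 30) | 2) & ~1;
        dx[6] = ((dl[6] >> 30) | 2) & ~1;
        dx[7] = ((dl[7] >> 30) | 4) & ~3;
        dl[0] = dl[1]; dl[1] = dl[2]; dl[2] = dl[3]; dl[3] = dl[4];

        *error = *in;            /* sign source for the next adaptation */
        *in   += sum >> shift;   /* reconstruct the sample              */

        /* second-order history update on the reconstructed sample */
        dl[4] = -dl[5]; dl[5] = -dl[6];
        dl[6] = *in - dl[7]; dl[7] = *in;
        dl[5] += dl[6]; dl[4] += dl[5];
    }
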
diff --git a/libavcodec/x86/ttadsp_init.c b/libavcodec/x86/ttadsp_init.c
new file mode 100644
index 0000000000..aa998c1afd
--- /dev/null
+++ b/libavcodec/x86/ttadsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/ttadsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_tta_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
+ int32_t *error, int32_t *in, int32_t shift,
+ int32_t round);
+void ff_tta_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
+ int32_t *error, int32_t *in, int32_t shift,
+ int32_t round);
+
+av_cold void ff_ttadsp_init_x86(TTADSPContext *c)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSSE3(cpu_flags))
+ c->filter_process = ff_tta_filter_process_ssse3;
+ if (EXTERNAL_SSE4(cpu_flags))
+ c->filter_process = ff_tta_filter_process_sse4;
+#endif
+}
diff --git a/libavcodec/x86/ttaencdsp.asm b/libavcodec/x86/ttaencdsp.asm
new file mode 100644
index 0000000000..c9cbd49874
--- /dev/null
+++ b/libavcodec/x86/ttaencdsp.asm
@@ -0,0 +1,119 @@
+;******************************************************************************
+;* TTA Encoder DSP SIMD optimizations
+;*
+;* Copyright (C) 2014-2016 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_n0113: dd ~0, ~1, ~1, ~3
+pd_1224: dd 1, 2, 2, 4
+
+SECTION .text
+
+%macro TTAENC_FILTER 2
+INIT_XMM %1
+cglobal ttaenc_filter_process, 5,5,%2, qm, dx, dl, error, in, shift, round
+ mova m2, [qmq ]
+ mova m3, [qmq + 0x10]
+ mova m4, [dxq ]
+ mova m5, [dxq + 0x10]
+
+ movd m6, [errorq] ; if (filter->error < 0) {
+ SPLATD m6 ; for (int i = 0; i < 8; i++)
+ psignd m0, m4, m6 ; filter->qm[i] -= filter->dx[i];
+ psignd m1, m5, m6 ; } else if (filter->error > 0) {
+ paddd m2, m0 ; for (int i = 0; i < 8; i++)
+ paddd m3, m1 ; filter->qm[i] += filter->dx[i];
+ mova [qmq ], m2 ; }
+ mova [qmq + 0x10], m3 ;
+
+ mova m0, [dlq ]
+ mova m1, [dlq + 0x10]
+
+%if cpuflag(sse4)
+ pmulld m2, m0
+ pmulld m3, m1
+%else
+ pshufd m6, m0, 0xb1
+ pshufd m7, m2, 0xb1
+ pmuludq m6, m7
+ pshufd m6, m6, 0xd8
+ pmuludq m2, m0
+ pshufd m2, m2, 0xd8
+ punpckldq m2, m6
+
+ pshufd m6, m1, 0xb1
+ pshufd m7, m3, 0xb1
+ pmuludq m6, m7
+ pshufd m6, m6, 0xd8
+ pmuludq m3, m1
+ pshufd m3, m3, 0xd8
+ punpckldq m3, m6
+%endif
+ ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
+ paddd m2, m3 ; int sum = filter->round +
+ ; filter->dl[0] * filter->qm[0] +
+ pshufd m3, m2, 0xe ; filter->dl[1] * filter->qm[1] +
+ paddd m2, m3 ; filter->dl[2] * filter->qm[2] +
+ ; filter->dl[3] * filter->qm[3] +
+ movd m6, roundm ; filter->dl[4] * filter->qm[4] +
+ paddd m6, m2 ; filter->dl[5] * filter->qm[5] +
+ pshufd m2, m2, 0x1 ; filter->dl[6] * filter->qm[6] +
+ paddd m6, m2 ; filter->dl[7] * filter->qm[7];
+
+ palignr m5, m4, 4 ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
+ ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
+
+ palignr m2, m1, m0, 4 ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
+ ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
+
+ psrad m4, m1, 30 ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
+ por m4, [pd_1224 ] ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
+ pand m4, [pd_n0113] ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
+ ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
+
+ mova [dlq ], m2
+ mova [dxq ], m5
+ mova [dxq + 0x10], m4
+
+ movd m2, shiftm ;
+ movd m0, [inq] ;
+ psrad m6, m2 ;
+ psubd m3, m0, m6 ;
+ movd [inq], m3 ; *in -= (sum >> filter->shift);
+ movd [errorq], m3 ; filter->error = *in;
+
+ psrldq m1, 4 ;
+ pslldq m0, 12 ; filter->dl[4] = -filter->dl[5];
+ pshufd m0, m0, 0xf0 ; filter->dl[5] = -filter->dl[6];
+ psubd m0, m1 ; filter->dl[6] = *in - filter->dl[7];
+ psrldq m1, m0, 4 ; filter->dl[7] = *in;
+ pshufd m1, m1, 0xf4 ; filter->dl[5] += filter->dl[6];
+ paddd m0, m1 ; filter->dl[4] += filter->dl[5];
+ psrldq m1, 4 ;
+ paddd m0, m1 ;
+ mova [dlq + 0x10], m0 ;
+ RET
+%endmacro
+
+TTAENC_FILTER ssse3, 8
+TTAENC_FILTER sse4, 7
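
This is the same hybrid filter as in ttadsp.asm; per the final stores above, the only behavioural difference is that the encoder feeds the delay line with the unmodified input sample, subtracts the prediction instead of adding it, and takes the adaptation error from the resulting residual. Relative to the decoder sketch after ttadsp.asm, the tail becomes roughly:

    /* dl[] is updated from the original *in, then: */
    *in    -= sum >> shift;   /* emit the prediction residual        */
    *error  = *in;            /* adapt on the residual next time     */
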
diff --git a/libavcodec/x86/ttaencdsp_init.c b/libavcodec/x86/ttaencdsp_init.c
new file mode 100644
index 0000000000..c1a02fdac9
--- /dev/null
+++ b/libavcodec/x86/ttaencdsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014-2016 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/ttaencdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_ttaenc_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
+ int32_t *error, int32_t *in, int32_t shift,
+ int32_t round);
+void ff_ttaenc_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
+ int32_t *error, int32_t *in, int32_t shift,
+ int32_t round);
+
+av_cold void ff_ttaencdsp_init_x86(TTAEncDSPContext *c)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSSE3(cpu_flags))
+ c->filter_process = ff_ttaenc_filter_process_ssse3;
+ if (EXTERNAL_SSE4(cpu_flags))
+ c->filter_process = ff_ttaenc_filter_process_sse4;
+#endif
+}
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000000..f579307aa0
--- /dev/null
+++ b/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void ff_v210_x86_init(V210DecContext *s)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (s->aligned_input) {
+ if (cpu_flags & AV_CPU_FLAG_SSSE3)
+ s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+ if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+ s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+ }
+ else {
+ if (cpu_flags & AV_CPU_FLAG_SSSE3)
+ s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+ if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+ s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+ }
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000000..c24c765e5b
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,90 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff
+v210_mult: dw 64,4,64,4,64,4,64,4
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 1
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+cglobal v210_planar_unpack_%1, 5, 5, 7
+ movsxdifnidn r4, r4d
+ lea r1, [r1+2*r4]
+ add r2, r4
+ add r3, r4
+ neg r4
+
+ mova m3, [v210_mult]
+ mova m4, [v210_mask]
+ mova m5, [v210_luma_shuf]
+ mova m6, [v210_chroma_shuf]
+.loop:
+%ifidn %1, unaligned
+ movu m0, [r0]
+%else
+ mova m0, [r0]
+%endif
+
+ pmullw m1, m0, m3
+ psrld m0, 10
+ psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5
+ pand m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+ shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+ pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+ movu [r1+2*r4], m2
+
+ shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+ pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+ movq [r2+r4], m1
+ movhps [r3+r4], m1
+
+ add r0, mmsize
+ add r4, 6
+ jl .loop
+
+ REP_RET
+%endmacro
+
+INIT_XMM ssse3
+v210_planar_unpack unaligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack unaligned
+%endif
+
+INIT_XMM ssse3
+v210_planar_unpack aligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack aligned
+%endif
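
v210 stores three 10-bit components per little-endian 32-bit word, so a group of four words carries six pixels; the shuffles above rebuild contiguous y/u/v planes from that layout. A scalar sketch of the same unpacking, assuming the usual v210 word order and a width that is a multiple of 6 (the scalar fallback in libavcodec/v210dec.c is the reference):

    #include <stdint.h>

    static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y,
                                     uint16_t *u, uint16_t *v, int width)
    {
        for (int w = 0; w < width; w += 6) {
            uint32_t a = src[0], b = src[1], c = src[2], d = src[3];
            src += 4;

            /* components sit in bits 0-9, 10-19 and 20-29 of each word */
            *u++ = a & 0x3ff; *y++ = (a >> 10) & 0x3ff; *v++ = (a >> 20) & 0x3ff;
            *y++ = b & 0x3ff; *u++ = (b >> 10) & 0x3ff; *y++ = (b >> 20) & 0x3ff;
            *v++ = c & 0x3ff; *y++ = (c >> 10) & 0x3ff; *u++ = (c >> 20) & 0x3ff;
            *y++ = d & 0x3ff; *v++ = (d >> 10) & 0x3ff; *y++ = (d >> 20) & 0x3ff;
        }
    }
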
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index 0db0196313..965f2bea3c 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -2,20 +2,20 @@
;* V210 SIMD pack
;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -23,8 +23,9 @@
SECTION_RODATA 32
-v210_enc_min_10: times 32 dw 0x4
-v210_enc_max_10: times 32 dw 0x3fb
+cextern pw_4
+%define v210_enc_min_10 pw_4
+v210_enc_max_10: times 16 dw 0x3fb
v210_enc_luma_mult_10: times 2 dw 4,1,16,4,1,16,0,0
v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
@@ -32,16 +33,19 @@ v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
v210_enc_chroma_mult_10: times 2 dw 1,4,16,0,16,1,4,0
v210_enc_chroma_shuf_10: times 2 db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
-v210_enc_min_8: times 32 db 0x1
-v210_enc_max_8: times 32 db 0xfe
+cextern pb_1
+%define v210_enc_min_8 pb_1
+cextern pb_FE
+%define v210_enc_max_8 pb_FE
-v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
v210_enc_luma_shuf_8: times 2 db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
+v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
-v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
v210_enc_chroma_shuf1_8: times 2 db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
+v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
+
SECTION .text
%macro v210_planar_pack_10 0
@@ -59,16 +63,16 @@ cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
.loop:
movu xm0, [yq+2*widthq]
%if cpuflag(avx2)
- vinserti128 m0, m0, [yq+2*widthq+12], 1
+ vinserti128 m0, m0, [yq+widthq*2+12], 1
%endif
CLIPW m0, m2, m3
- movq xm1, [uq+widthq]
- movhps xm1, [vq+widthq]
+ movq xm1, [uq+widthq]
+ movhps xm1, [vq+widthq]
%if cpuflag(avx2)
movq xm4, [uq+widthq+6]
movhps xm4, [vq+widthq+6]
- vinserti128 m1, m1, xm4, 1
+ vinserti128 m1, m1, xm4, 1
%endif
CLIPW m1, m2, m3
@@ -93,6 +97,7 @@ cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
INIT_XMM ssse3
v210_planar_pack_10
%endif
+
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_pack_10
@@ -113,9 +118,9 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
pxor m6, m6
.loop:
- movu xm1, [yq+2*widthq]
+ movu xm1, [yq+widthq*2]
%if cpuflag(avx2)
- vinserti128 m1, m1, [yq+2*widthq+12], 1
+ vinserti128 m1, m1, [yq+widthq*2+12], 1
%endif
CLIPUB m1, m4, m5
@@ -172,6 +177,7 @@ v210_planar_pack_8
INIT_XMM avx
v210_planar_pack_8
%endif
+
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_pack_8
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index c4d2745b6f..e997b4b67a 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vc1dsp.h b/libavcodec/x86/vc1dsp.h
index 9b6c8ada26..fdd4de1813 100644
--- a/libavcodec/x86/vc1dsp.h
+++ b/libavcodec/x86/vc1dsp.h
@@ -1,20 +1,20 @@
/*
* VC-1 and WMV3 decoder - X86 DSP init functions
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 8982ff908a..79d22a294f 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -27,6 +27,7 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
#include "libavcodec/vc1dsp.h"
#include "fpel.h"
#include "vc1dsp.h"
@@ -63,11 +64,22 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}
-static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd)
-{
- ff_avg_pixels8_mmxext(dst, src, stride, 8);
-}
+#define DECLARE_FUNCTION(OP, DEPTH, INSN) \
+ static void OP##vc1_mspel_mc00_##DEPTH##INSN(uint8_t *dst, \
+ const uint8_t *src, ptrdiff_t stride, int rnd) \
+ { \
+ ff_ ## OP ## pixels ## DEPTH ## INSN(dst, src, stride, DEPTH); \
+ }
+
+DECLARE_FUNCTION(put_, 8, _mmx)
+DECLARE_FUNCTION(put_, 16, _mmx)
+DECLARE_FUNCTION(avg_, 8, _mmx)
+DECLARE_FUNCTION(avg_, 16, _mmx)
+DECLARE_FUNCTION(avg_, 8, _mmxext)
+DECLARE_FUNCTION(avg_, 16, _mmxext)
+DECLARE_FUNCTION(put_, 16, _sse2)
+DECLARE_FUNCTION(avg_, 16, _sse2)
+
#endif /* HAVE_YASM */
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
@@ -80,16 +92,24 @@ void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
+void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
- if (INLINE_MMX(cpu_flags))
+ if (HAVE_6REGS && INLINE_MMX(cpu_flags) && EXTERNAL_MMX(cpu_flags))
ff_vc1dsp_init_mmx(dsp);
- if (INLINE_MMXEXT(cpu_flags))
+ if (HAVE_6REGS && INLINE_MMXEXT(cpu_flags) && EXTERNAL_MMXEXT(cpu_flags))
ff_vc1dsp_init_mmxext(dsp);
#define ASSIGN_LF(EXT) \
@@ -103,6 +123,11 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
#if HAVE_YASM
if (EXTERNAL_MMX(cpu_flags)) {
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
+
+ dsp->put_vc1_mspel_pixels_tab[1][0] = put_vc1_mspel_mc00_8_mmx;
+ dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_mmx;
+ dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmx;
+ dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
@@ -111,13 +136,22 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
ASSIGN_LF(mmxext);
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[0] = avg_vc1_mspel_mc00_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmxext;
+
+ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmxext;
+ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmxext;
+ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmxext;
+ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2;
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
+
+ dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_sse2;
+ dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_LF(ssse3);
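
For reference, the DECLARE_FUNCTION macro added above only generates thin no-shift (mc00) wrappers around the fpel copy/average helpers declared in fpel.h; one expansion, written out by hand:

    static void avg_vc1_mspel_mc00_16_sse2(uint8_t *dst, const uint8_t *src,
                                           ptrdiff_t stride, int rnd)
    {
        ff_avg_pixels16_sse2(dst, src, stride, 16);   /* rnd is unused */
    }
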
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp_loopfilter.asm
index adf08d7d84..1838f6f235 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp_loopfilter.asm
@@ -1,21 +1,21 @@
;******************************************************************************
-;* VC1 deblocking optimizations
+;* VC1 loopfilter optimizations
;* Copyright (c) 2009 David Conrad
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
new file mode 100644
index 0000000000..2850ca861d
--- /dev/null
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -0,0 +1,292 @@
+;******************************************************************************
+;* VC1 motion compensation optimizations
+;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_9
+cextern pw_128
+
+section .text
+
+%if HAVE_MMX_INLINE
+
+; XXX some of these macros are not used right now, but they will be once
+; more functions are ported.
+
+%macro OP_PUT 2 ; dst, src
+%endmacro
+
+%macro OP_AVG 2 ; dst, src
+ pavgb %1, %2
+%endmacro
+
+%macro NORMALIZE_MMX 1 ; shift
+ paddw m3, m7 ; +bias-r
+ paddw m4, m7 ; +bias-r
+ psraw m3, %1
+ psraw m4, %1
+%endmacro
+
+%macro TRANSFER_DO_PACK 2 ; op, dst
+ packuswb m3, m4
+ %1 m3, [%2]
+ mova [%2], m3
+%endmacro
+
+%macro TRANSFER_DONT_PACK 2 ; op, dst
+ %1 m3, [%2]
+ %1 m3, [%2 + mmsize]
+ mova [%2], m3
+ mova [mmsize + %2], m4
+%endmacro
+
+; see MSPEL_FILTER13_CORE for use as UNPACK macro
+%macro DO_UNPACK 1 ; reg
+ punpcklbw %1, m0
+%endmacro
+%macro DONT_UNPACK 1 ; reg
+%endmacro
+
+; Compute the rounder 32-r or 8-r and unpack it to m7
+%macro LOAD_ROUNDER_MMX 1 ; round
+ movd m7, %1
+ punpcklwd m7, m7
+ punpckldq m7, m7
+%endmacro
+
+%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
+ paddw m%3, m%4
+ movh m%2, [srcq + stride_neg2]
+ pmullw m%3, m6
+ punpcklbw m%2, m0
+ movh m%5, [srcq + strideq]
+ psubw m%3, m%2
+ punpcklbw m%5, m0
+ paddw m%3, m7
+ psubw m%3, m%5
+ psraw m%3, shift
+ movu [dstq + %1], m%3
+ add srcq, strideq
+%endmacro
+
+INIT_MMX mmx
+; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
+; x86_reg stride, int rnd, int64_t shift)
+; Sacrificing m6 makes it possible to pipeline loads from src
+%if ARCH_X86_32
+cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
+ DECLARE_REG_TMP 3, 4, 5
+ %define rnd r3mp
+ %define shift qword r4m
+%else ; X86_64
+cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
+ DECLARE_REG_TMP 4, 5, 6
+ %define rnd r3d
+ ; We need shift either in memory or in a mm reg as it's used in psraw
+ ; On WIN64, the arg is already on the stack
+ ; On UNIX64, m5 doesn't seem to be used
+%if WIN64
+ %define shift r4mp
+%else ; UNIX64
+ %define shift m5
+ mova shift, r4q
+%endif ; WIN64
+%endif ; X86_32
+%define stride_neg2 t0q
+%define stride_9minus4 t1q
+%define i t2q
+ mov stride_neg2, strideq
+ neg stride_neg2
+ add stride_neg2, stride_neg2
+ lea stride_9minus4, [strideq * 9 - 4]
+ mov i, 3
+ LOAD_ROUNDER_MMX rnd
+ mova m6, [pw_9]
+ pxor m0, m0
+.loop:
+ movh m2, [srcq]
+ add srcq, strideq
+ movh m3, [srcq]
+ punpcklbw m2, m0
+ punpcklbw m3, m0
+ SHIFT2_LINE 0, 1, 2, 3, 4
+ SHIFT2_LINE 24, 2, 3, 4, 1
+ SHIFT2_LINE 48, 3, 4, 1, 2
+ SHIFT2_LINE 72, 4, 1, 2, 3
+ SHIFT2_LINE 96, 1, 2, 3, 4
+ SHIFT2_LINE 120, 2, 3, 4, 1
+ SHIFT2_LINE 144, 3, 4, 1, 2
+ SHIFT2_LINE 168, 4, 1, 2, 3
+ sub srcq, stride_9minus4
+ add dstq, 8
+ dec i
+ jnz .loop
+ REP_RET
+%undef rnd
+%undef shift
+%undef stride_neg2
+%undef stride_9minus4
+%undef i
+
+; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+; const int16_t *src, int rnd);
+; Data is already unpacked, so some operations can be performed directly on
+; memory operands.
+%macro HOR_16B_SHIFT2 2 ; op, opname
+cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
+ mov hq, 8
+ sub srcq, 2
+ sub rndd, (-1+9+9-1) * 1024 ; add -1024 bias
+ LOAD_ROUNDER_MMX rndd
+ mova m5, [pw_9]
+ mova m6, [pw_128]
+ pxor m0, m0
+
+.loop:
+ mova m1, [srcq + 2 * 0]
+ mova m2, [srcq + 2 * 0 + mmsize]
+ mova m3, [srcq + 2 * 1]
+ mova m4, [srcq + 2 * 1 + mmsize]
+ paddw m3, [srcq + 2 * 2]
+ paddw m4, [srcq + 2 * 2 + mmsize]
+ paddw m1, [srcq + 2 * 3]
+ paddw m2, [srcq + 2 * 3 + mmsize]
+ pmullw m3, m5
+ pmullw m4, m5
+ psubw m3, m1
+ psubw m4, m2
+ NORMALIZE_MMX 7
+ ; remove bias
+ paddw m3, m6
+ paddw m4, m6
+ TRANSFER_DO_PACK %1, dstq
+ add srcq, 24
+ add dstq, strideq
+ dec hq
+ jnz .loop
+
+ RET
+%endmacro
+
+INIT_MMX mmx
+HOR_16B_SHIFT2 OP_PUT, put
+
+INIT_MMX mmxext
+HOR_16B_SHIFT2 OP_AVG, avg
+%endif ; HAVE_MMX_INLINE
+
+%macro INV_TRANS_INIT 0
+ movsxdifnidn linesizeq, linesized
+ movd m0, blockd
+ SPLATW m0, m0
+ pxor m1, m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+
+ DEFINE_ARGS dest, linesize, linesize3
+ lea linesize3q, [linesizeq*3]
+%endmacro
+
+%macro INV_TRANS_PROCESS 1
+ mov%1 m2, [destq+linesizeq*0]
+ mov%1 m3, [destq+linesizeq*1]
+ mov%1 m4, [destq+linesizeq*2]
+ mov%1 m5, [destq+linesize3q]
+ paddusb m2, m0
+ paddusb m3, m0
+ paddusb m4, m0
+ paddusb m5, m0
+ psubusb m2, m1
+ psubusb m3, m1
+ psubusb m4, m1
+ psubusb m5, m1
+ mov%1 [linesizeq*0+destq], m2
+ mov%1 [linesizeq*1+destq], m3
+ mov%1 [linesizeq*2+destq], m4
+ mov%1 [linesize3q +destq], m5
+%endmacro
+
+; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
+ movsx r3d, WORD [blockq]
+ mov blockd, r3d ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+4] ; 17 * dc + 4
+ sar blockd, 3 ; >> 3
+ mov r3d, blockd ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+64] ; 17 * dc + 64
+ sar blockd, 7 ; >> 7
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS h
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
+ movsx r3d, WORD [blockq]
+ mov blockd, r3d ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+4] ; 17 * dc + 4
+ sar blockd, 3 ; >> 3
+ shl blockd, 2 ; 4 * dc
+ lea blockd, [blockq*3+64] ; 12 * dc + 64
+ sar blockd, 7 ; >> 7
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS h
+ lea destq, [destq+linesizeq*4]
+ INV_TRANS_PROCESS h
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
+ movsx blockd, WORD [blockq] ; dc
+ lea blockd, [blockq*3+1] ; 3 * dc + 1
+ sar blockd, 1 ; >> 1
+ mov r3d, blockd ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+64] ; 17 * dc + 64
+ sar blockd, 7 ; >> 7
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS a
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
+ movsx blockd, WORD [blockq] ; dc
+ lea blockd, [blockq*3+1] ; 3 * dc + 1
+ sar blockd, 1 ; >> 1
+ lea blockd, [blockq*3+16] ; 3 * dc + 16
+ sar blockd, 5 ; >> 5
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS a
+ lea destq, [destq+linesizeq*4]
+ INV_TRANS_PROCESS a
+ RET
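
The *_dc kernels above cover the common case where only the DC coefficient of a block is non-zero: the two 1-D transform passes then collapse into two scalings of that value, which is added to the destination with saturation (the paddusb/psubusb pair in INV_TRANS_PROCESS). A scalar sketch of the 8x8 variant; the other sizes only swap in the scaling steps annotated next to the lea/sar instructions, e.g. (17*dc+4)>>3 and (17*dc+64)>>7 for 4x4:

    #include <stddef.h>
    #include <stdint.h>

    static inline uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, ptrdiff_t linesize,
                                       int16_t *block)
    {
        int dc = block[0];

        dc = (3 * dc +  1) >> 1;   /* row-pass gain    */
        dc = (3 * dc + 16) >> 5;   /* column-pass gain */

        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++)
                dest[x] = clip_u8(dest[x] + dc);
            dest += linesize;
        }
    }
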
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index d64ddf0174..45c8a68f29 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -25,7 +25,6 @@
*/
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -34,7 +33,15 @@
#include "fpel.h"
#include "vc1dsp.h"
-#if HAVE_INLINE_ASM
+#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
+
+void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
+ const uint8_t *src, x86_reg stride,
+ int rnd, int64_t shift);
+void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+ const int16_t *src, int rnd);
+void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
+ const int16_t *src, int rnd);
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -67,102 +74,6 @@
"punpcklwd %%mm7, %%mm7 \n\t" \
"punpckldq %%mm7, %%mm7 \n\t"
-#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
- "paddw %%mm"#R2", %%mm"#R1" \n\t" \
- "movd (%0,%3), %%mm"#R0" \n\t" \
- "pmullw %%mm6, %%mm"#R1" \n\t" \
- "punpcklbw %%mm0, %%mm"#R0" \n\t" \
- "movd (%0,%2), %%mm"#R3" \n\t" \
- "psubw %%mm"#R0", %%mm"#R1" \n\t" \
- "punpcklbw %%mm0, %%mm"#R3" \n\t" \
- "paddw %%mm7, %%mm"#R1" \n\t" \
- "psubw %%mm"#R3", %%mm"#R1" \n\t" \
- "psraw %4, %%mm"#R1" \n\t" \
- "movq %%mm"#R1", "#OFF"(%1) \n\t" \
- "add %2, %0 \n\t"
-
-/** Sacrificing mm6 allows to pipeline loads from src */
-static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
- const uint8_t *src, x86_reg stride,
- int rnd, int64_t shift)
-{
- __asm__ volatile(
- "mov $3, %%"FF_REG_c" \n\t"
- LOAD_ROUNDER_MMX("%5")
- "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
- "1: \n\t"
- "movd (%0), %%mm2 \n\t"
- "add %2, %0 \n\t"
- "movd (%0), %%mm3 \n\t"
- "punpcklbw %%mm0, %%mm2 \n\t"
- "punpcklbw %%mm0, %%mm3 \n\t"
- SHIFT2_LINE( 0, 1, 2, 3, 4)
- SHIFT2_LINE( 24, 2, 3, 4, 1)
- SHIFT2_LINE( 48, 3, 4, 1, 2)
- SHIFT2_LINE( 72, 4, 1, 2, 3)
- SHIFT2_LINE( 96, 1, 2, 3, 4)
- SHIFT2_LINE(120, 2, 3, 4, 1)
- SHIFT2_LINE(144, 3, 4, 1, 2)
- SHIFT2_LINE(168, 4, 1, 2, 3)
- "sub %6, %0 \n\t"
- "add $8, %1 \n\t"
- "dec %%"FF_REG_c" \n\t"
- "jnz 1b \n\t"
- : "+r"(src), "+r"(dst)
- : "r"(stride), "r"(-2*stride),
- "m"(shift), "m"(rnd), "r"(9*stride-4)
- : "%"FF_REG_c, "memory"
- );
-}
-
-/**
- * Data is already unpacked, so some operations can directly be made from
- * memory.
- */
-#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
- const int16_t *src, int rnd)\
-{\
- int h = 8;\
-\
- src -= 1;\
- rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
- __asm__ volatile(\
- LOAD_ROUNDER_MMX("%4")\
- "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
- "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
- "1: \n\t"\
- "movq 2*0+0(%1), %%mm1 \n\t"\
- "movq 2*0+8(%1), %%mm2 \n\t"\
- "movq 2*1+0(%1), %%mm3 \n\t"\
- "movq 2*1+8(%1), %%mm4 \n\t"\
- "paddw 2*3+0(%1), %%mm1 \n\t"\
- "paddw 2*3+8(%1), %%mm2 \n\t"\
- "paddw 2*2+0(%1), %%mm3 \n\t"\
- "paddw 2*2+8(%1), %%mm4 \n\t"\
- "pmullw %%mm5, %%mm3 \n\t"\
- "pmullw %%mm5, %%mm4 \n\t"\
- "psubw %%mm1, %%mm3 \n\t"\
- "psubw %%mm2, %%mm4 \n\t"\
- NORMALIZE_MMX("$7")\
- /* Remove bias */\
- "paddw %%mm6, %%mm3 \n\t"\
- "paddw %%mm6, %%mm4 \n\t"\
- TRANSFER_DO_PACK(OP)\
- "add $24, %1 \n\t"\
- "add %3, %2 \n\t"\
- "decl %0 \n\t"\
- "jnz 1b \n\t"\
- : "+r"(h), "+r" (src), "+r" (dst)\
- : "r"(stride), "m"(rnd)\
- : "memory"\
- );\
-}
-
-VC1_HOR_16b_SHIFT2(OP_PUT, put_)
-VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
-
-
/**
* Purely vertical or horizontal 1/2 shift interpolation.
* Sacrifice mm6 for *9 factor.
@@ -213,6 +124,7 @@ static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
: "+r"(src), "+r"(dst)\
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
"g"(stride-offset)\
+ NAMED_CONSTRAINTS_ADD(ff_pw_9)\
: "%"FF_REG_c, "memory"\
);\
}
@@ -315,6 +227,7 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(src_stride), "r"(3*src_stride), \
"m"(rnd), "m"(shift) \
+ NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \
: "memory" \
); \
}
@@ -352,6 +265,7 @@ OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(stride), "m"(rnd) \
+ NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \
: "memory" \
); \
}
@@ -387,6 +301,7 @@ OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
+ NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \
: "memory" \
); \
}
@@ -420,14 +335,14 @@ typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_
* @param hmode Vertical filter.
* @param rnd Rounding bias.
*/
-#define VC1_MSPEL_MC(OP)\
+#define VC1_MSPEL_MC(OP, INSTR)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
int hmode, int vmode, int rnd)\
{\
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
- { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
+ { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
- { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
+ { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
{ NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
@@ -441,7 +356,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
static const int shift_value[] = { 0, 5, 1, 5 };\
int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
int r;\
- DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
+ LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);\
\
r = (1<<(shift-1)) + rnd-1;\
vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
@@ -457,10 +372,19 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
\
/* Horizontal mode with no vertical mode */\
vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
+} \
+static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
+ int stride, int hmode, int vmode, int rnd)\
+{ \
+ OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+ OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
+ dst += 8*stride; src += 8*stride; \
+ OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+ OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}
-VC1_MSPEL_MC(put_)
-VC1_MSPEL_MC(avg_)
+VC1_MSPEL_MC(put_, mmx)
+VC1_MSPEL_MC(avg_, mmxext)
/** Macro to ease bicubic filter interpolation functions declarations */
#define DECLARE_FUNCTION(a, b) \
@@ -477,6 +401,20 @@ static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
int rnd) \
{ \
avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
+}\
+static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \
+ const uint8_t *src, \
+ ptrdiff_t stride, \
+ int rnd) \
+{ \
+ put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
+}\
+static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \
+ const uint8_t *src,\
+ ptrdiff_t stride, \
+ int rnd) \
+{ \
+ avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}
DECLARE_FUNCTION(0, 1)
@@ -498,261 +436,51 @@ DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
-static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
- int16_t *block)
-{
- int dc = block[0];
- dc = (17 * dc + 4) >> 3;
- dc = (17 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
-}
-
-static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
- int16_t *block)
-{
- int dc = block[0];
- dc = (17 * dc + 4) >> 3;
- dc = (12 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
- dest += 4 * stride;
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
-}
-
-static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
- int16_t *block)
-{
- int dc = block[0];
- dc = ( 3 * dc + 1) >> 1;
- dc = (17 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
-}
-
-static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
- int16_t *block)
-{
- int dc = block[0];
- dc = (3 * dc + 1) >> 1;
- dc = (3 * dc + 16) >> 5;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
- dest += 4 * stride;
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
-}
-
-static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd)
-{
- ff_put_pixels8_mmx(dst, src, stride, 8);
-}
+#define FN_ASSIGN(OP, X, Y, INSN) \
+ dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
+ dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
- dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
- dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
-
- dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
- dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
-
- dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
- dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
- dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
-
- dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
- dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
- dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
+ FN_ASSIGN(put_, 0, 1, _mmx);
+ FN_ASSIGN(put_, 0, 2, _mmx);
+ FN_ASSIGN(put_, 0, 3, _mmx);
+
+ FN_ASSIGN(put_, 1, 0, _mmx);
+ FN_ASSIGN(put_, 1, 1, _mmx);
+ FN_ASSIGN(put_, 1, 2, _mmx);
+ FN_ASSIGN(put_, 1, 3, _mmx);
+
+ FN_ASSIGN(put_, 2, 0, _mmx);
+ FN_ASSIGN(put_, 2, 1, _mmx);
+ FN_ASSIGN(put_, 2, 2, _mmx);
+ FN_ASSIGN(put_, 2, 3, _mmx);
+
+ FN_ASSIGN(put_, 3, 0, _mmx);
+ FN_ASSIGN(put_, 3, 1, _mmx);
+ FN_ASSIGN(put_, 3, 2, _mmx);
+ FN_ASSIGN(put_, 3, 3, _mmx);
}
av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
- dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
-
- dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
-
- dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
-
- dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
-
- dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
- dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
- dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
- dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
+ FN_ASSIGN(avg_, 0, 1, _mmxext);
+ FN_ASSIGN(avg_, 0, 2, _mmxext);
+ FN_ASSIGN(avg_, 0, 3, _mmxext);
+
+ FN_ASSIGN(avg_, 1, 0, _mmxext);
+ FN_ASSIGN(avg_, 1, 1, _mmxext);
+ FN_ASSIGN(avg_, 1, 2, _mmxext);
+ FN_ASSIGN(avg_, 1, 3, _mmxext);
+
+ FN_ASSIGN(avg_, 2, 0, _mmxext);
+ FN_ASSIGN(avg_, 2, 1, _mmxext);
+ FN_ASSIGN(avg_, 2, 2, _mmxext);
+ FN_ASSIGN(avg_, 2, 3, _mmxext);
+
+ FN_ASSIGN(avg_, 3, 0, _mmxext);
+ FN_ASSIGN(avg_, 3, 1, _mmxext);
+ FN_ASSIGN(avg_, 3, 2, _mmxext);
+ FN_ASSIGN(avg_, 3, 3, _mmxext);
}
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
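
The FN_ASSIGN macro fills both halves of the now two-dimensional mspel table: index [1] holds the 8x8 functions and [0] the 16x16 wrappers built by OP##vc1_mspel_mc_16, stored at position hmode + 4*vmode. One expansion for illustration:

    /* FN_ASSIGN(put_, 2, 3, _mmx) expands to: */
    dsp->put_vc1_mspel_pixels_tab[1][2 + 4 * 3] = put_vc1_mspel_mc23_mmx;
    dsp->put_vc1_mspel_pixels_tab[0][2 + 4 * 3] = put_vc1_mspel_mc23_16_mmx;
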
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index b22e0fec8b..e237860700 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -2,20 +2,20 @@
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -54,13 +54,13 @@ SECTION .text
; | | <- bottom is copied from last line in body of source
; '----' <- bh
%if ARCH_X86_64
-cglobal emu_edge_vvar, 7, 8, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
-%define dst_strideq r2mp
- mov srcq, r1mp
+%define dst_strideq r1mp
+ mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
@@ -97,7 +97,10 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
neg n_wordsq
lea start_xq, [start_xq+n_wordsq*2]
.y_loop: ; do {
- ; FIXME also write a ssse3 version using pshufb
+%if cpuflag(avx2)
+ vpbroadcastb m0, [dstq+start_xq]
+ mov wq, n_wordsq ; initialize w
+%else
movzx wd, byte [dstq+start_xq] ; w = read(1)
imul wd, 0x01010101 ; w *= 0x01010101
movd m0, wd
@@ -107,6 +110,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
%else ; mmx
punpckldq m0, m0 ; splat
%endif ; mmx/sse
+%endif ; avx2
.x_loop: ; do {
movu [dstq+wq*2], m0 ; write($reg, $mmsize)
add wq, mmsize/2 ; w -= $mmsize/2
@@ -127,6 +131,11 @@ hvar_fn
INIT_XMM sse2
hvar_fn
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+hvar_fn
+%endif
+
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
; - if (%2 & 8) fills 8 bytes into xmm$next
@@ -184,10 +193,10 @@ hvar_fn
mov valb, [srcq+%2-1]
%elif (%2-%%off) == 2
mov valw, [srcq+%2-2]
-%elifidn %1, body
- mov vald, [srcq+%2-3]
%else
- movd mm %+ %%mmx_idx, [srcq+%2-3]
+ mov valb, [srcq+%2-1]
+ ror vald, 16
+ mov valw, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES
@@ -240,15 +249,13 @@ hvar_fn
mov [dstq+%2-1], valb
%elif (%2-%%off) == 2
mov [dstq+%2-2], valw
-%elifidn %1, body
- mov [dstq+%2-3], valw
- shr vald, 16
- mov [dstq+%2-1], valb
%else
- movd vald, mm %+ %%mmx_idx
mov [dstq+%2-3], valw
- shr vald, 16
+ ror vald, 16
mov [dstq+%2-1], valb
+%ifnidn %1, body
+ ror vald, 16
+%endif
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES
@@ -262,30 +269,30 @@ hvar_fn
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
start_y, end_y, val, bh
mov bhq, r6mp ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
mov dstq, r0mp
- mov srcq, r1mp
+ mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
- mov srcq, r1mp
+ mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
@@ -344,9 +351,8 @@ VERTICAL_EXTEND 16, 22
; obviously not the same on both sides.
%macro READ_V_PIXEL 2
-%if %1 == 2
- movzx valw, byte %2
- imul valw, 0x0101
+%if cpuflag(avx2)
+ vpbroadcastb m0, %2
%else
movzx vald, byte %2
imul vald, 0x01010101
@@ -356,13 +362,16 @@ VERTICAL_EXTEND 16, 22
pshufd m0, m0, q0000
%else
punpckldq m0, m0
-%endif
-%endif ; %1 >= 8
-%endif
+%endif ; mmsize == 16
+%endif ; %1 > 16
+%endif ; avx2
%endmacro ; READ_V_PIXEL
%macro WRITE_V_PIXEL 2
%assign %%off 0
+
+%if %1 >= 8
+
%rep %1/mmsize
movu [%2+%%off], m0
%assign %%off %%off+mmsize
@@ -378,34 +387,44 @@ VERTICAL_EXTEND 16, 22
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
-%endif
+%endif ; mmsize == 16
%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
movq [%2+%1-8], m0
%assign %%off %1
-%elif %1 >= 8 && %1-%%off >= 4
- movd [%2+%%off], m0
-%assign %%off %%off+4
%else
- mov [%2+%%off], vald
+ movd [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4
-%if %1-%%off >= 2
-%if %1 >= 8
- movd [%2+%1-4], m0
+%else ; %1 < 8
+
+%rep %1/4
+ mov [%2+%%off], vald
+%assign %%off %%off+4
+%endrep ; %1/4
+
+%endif ; %1 >=/< 8
+
+%if %1-%%off == 2
+%if cpuflag(avx2)
+ movd [%2+%%off-2], m0
%else
mov [%2+%%off], valw
-%endif
+%endif ; avx2
%endif ; (%1-%%off)/2
%endmacro ; WRITE_V_PIXEL
%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
+%if cpuflag(avx2)
+cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
+%else
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
+%endif
.loop_y: ; do {
READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n)
WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n)
@@ -426,6 +445,11 @@ H_EXTEND 16, 22
INIT_XMM sse2
H_EXTEND 16, 22
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+H_EXTEND 8, 22
+%endif
+
%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
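In the hvar loop above, the pre-AVX2 path splats the single edge byte by multiplying it with 0x01010101 and then duplicating the 32-bit result across the register (pshufd/punpckldq), while the new AVX2 path gets the same effect from one vpbroadcastb. A scalar sketch of the multiply trick:

#include <stdint.h>

/* One byte replicated into all four byte lanes of a 32-bit word; the SIMD
 * code then broadcasts that word across the vector register. */
static inline uint32_t splat_byte(uint8_t b)
{
    return (uint32_t)b * 0x01010101u;
}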
diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
index 8ee837096a..26e072bb12 100644
--- a/libavcodec/x86/videodsp_init.c
+++ b/libavcodec/x86/videodsp_init.c
@@ -1,25 +1,27 @@
/*
+ * Copyright (C) 2002-2012 Michael Niedermayer
* Copyright (C) 2012 Ronald S. Bultje
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
@@ -28,11 +30,11 @@
#include "libavcodec/videodsp.h"
#if HAVE_YASM
-typedef void emu_edge_vfix_func(uint8_t *dst, const uint8_t *src,
- x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
+ const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh);
-typedef void emu_edge_vvar_func(uint8_t *dst, const uint8_t *src,
- x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
+ const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh,
x86_reg w);
@@ -59,7 +61,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
#if ARCH_X86_32
-static emu_edge_vfix_func *vfixtbl_mmx[22] = {
+static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
&ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx,
&ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx,
&ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx,
@@ -78,7 +80,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
-static emu_edge_vfix_func *vfixtbl_sse[22] = {
+static emu_edge_vfix_func * const vfixtbl_sse[22] = {
ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx,
ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx,
ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx,
@@ -107,7 +109,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
#if ARCH_X86_32
-static emu_edge_hfix_func *hfixtbl_mmx[11] = {
+static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
@@ -119,13 +121,30 @@ extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
-static emu_edge_hfix_func *hfixtbl_sse2[11] = {
+static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
+#if HAVE_AVX2_EXTERNAL
+extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
+static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
+ ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
+ ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
+ ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
+ ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
+};
+extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
+#endif
static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
ptrdiff_t dst_stride,
@@ -133,22 +152,26 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
x86_reg block_w, x86_reg block_h,
x86_reg src_x, x86_reg src_y,
x86_reg w, x86_reg h,
- emu_edge_vfix_func **vfix_tbl,
+ emu_edge_vfix_func * const *vfix_tbl,
emu_edge_vvar_func *v_extend_var,
- emu_edge_hfix_func **hfix_tbl,
+ emu_edge_hfix_func * const *hfix_tbl,
emu_edge_hvar_func *h_extend_var)
{
x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
if (!w || !h)
- return;
+ return;
+
+ av_assert2(block_w <= FFABS(dst_stride));
if (src_y >= h) {
- src -= src_y * src_stride;
- src_y = src_y_add = h - 1;
+ src -= src_y*src_stride;
+ src_y_add = h - 1;
+ src_y = h - 1;
} else if (src_y <= -block_h) {
- src -= src_y*src_stride;
- src_y = src_y_add = 1 - block_h;
+ src -= src_y*src_stride;
+ src_y_add = 1 - block_h;
+ src_y = 1 - block_h;
}
if (src_x >= w) {
src += w - 1 - src_x;
@@ -162,18 +185,17 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
start_x = FFMAX(0, -src_x);
end_y = FFMIN(block_h, h-src_y);
end_x = FFMIN(block_w, w-src_x);
- assert(start_x < end_x && block_w > 0);
- assert(start_y < end_y && block_h > 0);
+ av_assert2(start_x < end_x && block_w > 0);
+ av_assert2(start_y < end_y && block_h > 0);
// fill in the to-be-copied part plus all above/below
src += (src_y_add + start_y) * src_stride + start_x;
w = end_x - start_x;
if (w <= 22) {
- vfix_tbl[w - 1](dst + start_x, src,
- dst_stride, src_stride,
+ vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h);
} else {
- v_extend_var(dst + start_x, src, dst_stride, src_stride,
+ v_extend_var(dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h, w);
}
@@ -212,7 +234,7 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
-static av_noinline void emulated_edge_mc_sse(uint8_t * buf,const uint8_t *src,
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
@@ -231,10 +253,24 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
int src_x, int src_y, int w,
int h)
{
- emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x,
- src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+ emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+ src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
}
+
+#if HAVE_AVX2_EXTERNAL
+static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
+ ptrdiff_t buf_stride,
+ ptrdiff_t src_stride,
+ int block_w, int block_h,
+ int src_x, int src_y, int w,
+ int h)
+{
+ emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+ src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+ hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
+}
+#endif /* HAVE_AVX2_EXTERNAL */
#endif /* HAVE_YASM */
void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
@@ -264,5 +300,10 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_sse2;
}
+#if HAVE_AVX2_EXTERNAL
+ if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
+ ctx->emulated_edge_mc = emulated_edge_mc_avx2;
+ }
+#endif
#endif /* HAVE_YASM */
}
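For reference, the vfix/hfix/hvar kernels and the emulated_edge_mc() dispatcher above implement edge emulation: a block whose source rectangle partly lies outside the picture is filled by replicating the nearest edge pixels, with width-specialized kernels for w <= 22 and a variable-width fallback otherwise. A minimal scalar model of the overall effect (illustration only; here src points at the picture origin, which is not the FFmpeg calling convention):

#include <stddef.h>
#include <stdint.h>

/* Every destination pixel reads the nearest pixel inside the w x h picture;
 * the asm above produces the same result in row- and column-sized pieces. */
static void edge_emu_ref(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int block_w, int block_h,
                         int src_x, int src_y, int w, int h)
{
    for (int y = 0; y < block_h; y++) {
        int sy = src_y + y;
        sy = sy < 0 ? 0 : (sy >= h ? h - 1 : sy);
        for (int x = 0; x < block_w; x++) {
            int sx = src_x + x;
            sx = sx < 0 ? 0 : (sx >= w ? w - 1 : sx);
            dst[y * dst_stride + x] = src[(ptrdiff_t)sy * src_stride + sx];
        }
    }
}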
diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm
index c54650eef5..b25d838868 100644
--- a/libavcodec/x86/vorbisdsp.asm
+++ b/libavcodec/x86/vorbisdsp.asm
@@ -2,20 +2,20 @@
;* Vorbis x86 optimizations
;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c
index bbd83195cc..bc1cc43a18 100644
--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 8587741f95..d88d5a1edf 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -2,20 +2,20 @@
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -40,6 +40,7 @@ pb_81: times 8 db 0x81
cextern pb_1
cextern pb_3
cextern pb_80
+cextern pb_FE
cextern pw_8
@@ -141,6 +142,49 @@ cglobal vp3_h_loop_filter, 3, 4
STORE_4_WORDS m3
RET
+%macro PAVGB_NO_RND 0
+ mova m4, m0
+ mova m5, m2
+ pand m4, m1
+ pand m5, m3
+ pxor m1, m0
+ pxor m3, m2
+ pand m1, m6
+ pand m3, m6
+ psrlq m1, 1
+ psrlq m3, 1
+ paddb m4, m1
+ paddb m5, m3
+%endmacro
+
+INIT_MMX mmx
+cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
+ mova m6, [pb_FE]
+ lea stride3q,[strideq+strideq*2]
+.loop:
+ mova m0, [src1q]
+ mova m1, [src2q]
+ mova m2, [src1q+strideq]
+ mova m3, [src2q+strideq]
+ PAVGB_NO_RND
+ mova [dstq], m4
+ mova [dstq+strideq], m5
+
+ mova m0, [src1q+strideq*2]
+ mova m1, [src2q+strideq*2]
+ mova m2, [src1q+stride3q]
+ mova m3, [src2q+stride3q]
+ PAVGB_NO_RND
+ mova [dstq+strideq*2], m4
+ mova [dstq+stride3q], m5
+
+ lea src1q, [src1q+strideq*4]
+ lea src2q, [src2q+strideq*4]
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jnz .loop
+ RET
+
; from original comments: The Macro does IDct on 4 1-D Dcts
%macro BeginIDCT 0
movq m2, I(3)
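PAVGB_NO_RND above is the classic SWAR no-rounding average: (a & b) + (((a ^ b) & 0xFE) >> 1), with pb_FE masking the low bit of every byte so the psrlq cannot leak bits between neighbouring byte lanes. Scalar equivalent per byte:

#include <stdint.h>

/* Per-byte average rounded down, as PAVGB_NO_RND computes per lane:
 * (a & b) keeps the common bits, ((a ^ b) & 0xFE) >> 1 adds half of the
 * differing bits, giving (a + b) >> 1 without widening. */
static inline uint8_t avg_no_rnd(uint8_t a, uint8_t b)
{
    return (a & b) + (((a ^ b) & 0xFE) >> 1);
}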
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 043e10f720..1ba9576431 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -1,18 +1,20 @@
/*
- * This file is part of Libav.
+ * Copyright (c) 2009 David Conrad <lessen42@gmail.com>
*
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,7 +25,6 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/vp3dsp.h"
-#include "config.h"
void ff_vp3_idct_put_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
@@ -38,16 +39,21 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
+void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
+ const uint8_t *b, ptrdiff_t stride,
+ int h);
+
av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
-#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
+ c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
+#if ARCH_X86_32
c->idct_put = ff_vp3_idct_put_mmx;
c->idct_add = ff_vp3_idct_add_mmx;
- }
#endif
+ }
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
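The prototype added above is the entry point for the new asm in vp3dsp.asm: an 8-pixel-wide, h-row average of two reference blocks using the no-rounding average from the previous file, with a single stride shared by all three pointers. A scalar sketch matching that prototype:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of put_vp_no_rnd_pixels8_l2: dst = no-rounding average of the
 * two sources a and b, 8 pixels per row, h rows, shared stride. */
static void put_no_rnd_pixels8_l2_ref(uint8_t *dst, const uint8_t *a,
                                      const uint8_t *b, ptrdiff_t stride, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = (a[x] & b[x]) + (((a[x] ^ b[x]) & 0xFE) >> 1);
        dst += stride;
        a   += stride;
        b   += stride;
    }
}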
diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h
index 0a693684af..810cc8dcd8 100644
--- a/libavcodec/x86/vp56_arith.h
+++ b/libavcodec/x86/vp56_arith.h
@@ -4,49 +4,46 @@
* Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
* Copyright (C) 2010 Eli Friedman
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_VP56_ARITH_H
#define AVCODEC_X86_VP56_ARITH_H
-#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
+#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS
#define vp56_rac_get_prob vp56_rac_get_prob
static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
{
unsigned int code_word = vp56_rac_renorm(c);
- unsigned int high = c->high;
- unsigned int low = 1 + (((high - 1) * prob) >> 8);
+ unsigned int low = 1 + (((c->high - 1) * prob) >> 8);
unsigned int low_shift = low << 16;
int bit = 0;
+ c->code_word = code_word;
__asm__(
"subl %4, %1 \n\t"
"subl %3, %2 \n\t"
- "leal (%2, %3), %3 \n\t"
"setae %b0 \n\t"
"cmovb %4, %1 \n\t"
- "cmovb %3, %2 \n\t"
- : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
- : "r"(low)
+ "cmovb %5, %2 \n\t"
+ : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word)
+ : "r"(low_shift), "r"(low), "r"(code_word)
);
- c->high = high;
- c->code_word = code_word;
return bit;
}
#endif
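The rewritten inline asm drops one lea/cmov pair and stores c->code_word before the asm block; functionally it is still the usual range-coder bit decode, with the compare done by sub/setae and the "else" side folded into two cmovb. Roughly the portable C logic it implements (a sketch, not the exact fallback path):

/* bit is the comparison result; the two updates pick either the subtracted
 * values or the split depending on it (cmovb in the asm).  prob is the
 * probability of a 0 bit, scaled to 1..255. */
static inline int rac_get_prob_sketch(unsigned *high, unsigned *code_word,
                                      unsigned prob)
{
    unsigned low       = 1 + (((*high - 1) * prob) >> 8);
    unsigned low_shift = low << 16;
    int      bit       = *code_word >= low_shift;

    *high      = bit ? *high - low            : low;
    *code_word = bit ? *code_word - low_shift : *code_word;
    return bit;
}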
diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm
index b667d38aea..0be531e5c2 100644
--- a/libavcodec/x86/vp6dsp.asm
+++ b/libavcodec/x86/vp6dsp.asm
@@ -3,20 +3,20 @@
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c
index 6d98db19d4..ce498931d0 100644
--- a/libavcodec/x86/vp6dsp_init.c
+++ b/libavcodec/x86/vp6dsp_init.c
@@ -3,20 +3,20 @@
* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index b0f6b83767..e303b80293 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -906,6 +906,7 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
%4 [dst2q+strideq+%3], m5
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
; load data
@@ -929,8 +930,9 @@ cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, movh
RET
+%endif
-INIT_XMM sse4
+%macro VP8_IDCT_DC_ADD 0
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
; load data
movd m0, [blockq]
@@ -956,10 +958,25 @@ cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
paddw m4, m0
packuswb m2, m4
movd [dst1q], m2
+%if cpuflag(sse4)
pextrd [dst1q+strideq], m2, 1
pextrd [dst2q], m2, 2
pextrd [dst2q+strideq], m2, 3
+%else
+ psrldq m2, 4
+ movd [dst1q+strideq], m2
+ psrldq m2, 4
+ movd [dst2q], m2
+ psrldq m2, 4
+ movd [dst2q+strideq], m2
+%endif
RET
+%endmacro
+
+INIT_XMM sse2
+VP8_IDCT_DC_ADD
+INIT_XMM sse4
+VP8_IDCT_DC_ADD
;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
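The new VP8_IDCT_DC_ADD template now serves both the sse2 and sse4 builds; they compute the same thing and differ only in how the four result rows are stored (psrldq+movd versus pextrd). What the kernel computes, as a scalar sketch assuming the (dc + 4) >> 3 rounding of the C reference:

#include <stddef.h>
#include <stdint.h>

/* DC-only inverse transform: the single DC coefficient becomes one rounded
 * offset that is added, with clamping, to every pixel of the 4x4 block;
 * the coefficient is cleared afterwards. */
static void vp8_idct_dc_add_sketch(uint8_t *dst, int16_t block[16],
                                   ptrdiff_t stride)
{
    int dc = (block[0] + 4) >> 3;
    block[0] = 0;
    for (int y = 0; y < 4; y++, dst += stride)
        for (int x = 0; x < 4; x++) {
            int v = dst[x] + dc;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}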
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index 3e84bed424..814bf69382 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -3,20 +3,20 @@
* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -168,7 +168,7 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
- DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
+ LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
src -= srcstride * (TAPNUMY / 2 - 1); \
ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
@@ -213,7 +213,7 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
- DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
+ LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
tmp, SIZE, src, srcstride, height + 1, mx, my); \
ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
@@ -233,6 +233,8 @@ HVBILIN(ssse3, 8, 16, 16)
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
+void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
@@ -346,7 +348,7 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
- if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
+ if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
@@ -370,9 +372,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
- c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
@@ -416,7 +418,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
}
- if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
+ if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
@@ -427,9 +429,10 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
}
if (EXTERNAL_SSE2(cpu_flags)) {
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
- c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
@@ -454,7 +457,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
}
if (EXTERNAL_SSE4(cpu_flags)) {
- c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
diff --git a/libavcodec/x86/vp8dsp_loopfilter.asm b/libavcodec/x86/vp8dsp_loopfilter.asm
index 9ffd83a39b..caeb405267 100644
--- a/libavcodec/x86/vp8dsp_loopfilter.asm
+++ b/libavcodec/x86/vp8dsp_loopfilter.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 59cde79d89..66363d46ad 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -1,221 +1,124 @@
/*
* VP9 SIMD optimizations
*
- * Copyright (c) 2013 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
#if HAVE_YASM
-#define fpel_func(avg, sz, opt) \
-void ff_vp9_ ## avg ## sz ## _ ## opt(uint8_t *dst, ptrdiff_t dst_stride, \
- const uint8_t *src, ptrdiff_t src_stride, \
- int h, int mx, int my)
-
-fpel_func(put, 4, mmx);
-fpel_func(put, 8, mmx);
-fpel_func(put, 16, sse);
-fpel_func(put, 32, sse);
-fpel_func(put, 64, sse);
-fpel_func(avg, 4, mmxext);
-fpel_func(avg, 8, mmxext);
-fpel_func(avg, 16, sse2);
-fpel_func(avg, 32, sse2);
-fpel_func(avg, 64, sse2);
-fpel_func(put, 32, avx);
-fpel_func(put, 64, avx);
-fpel_func(avg, 32, avx2);
-fpel_func(avg, 64, avx2);
-#undef fpel_func
-
-#define mc_func(avg, sz, dir, opt, type, f_sz) \
-void \
-ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *src, \
- ptrdiff_t src_stride, \
- int h, \
- const type (*filter)[f_sz])
-
-#define mc_funcs(sz, opt, type, f_sz) \
- mc_func(put, sz, h, opt, type, f_sz); \
- mc_func(avg, sz, h, opt, type, f_sz); \
- mc_func(put, sz, v, opt, type, f_sz); \
- mc_func(avg, sz, v, opt, type, f_sz)
-
-mc_funcs(4, mmxext, int16_t, 8);
-mc_funcs(8, sse2, int16_t, 8);
-mc_funcs(4, ssse3, int8_t, 32);
-mc_funcs(8, ssse3, int8_t, 32);
+decl_fpel_func(put, 4, , mmx);
+decl_fpel_func(put, 8, , mmx);
+decl_fpel_func(put, 16, , sse);
+decl_fpel_func(put, 32, , sse);
+decl_fpel_func(put, 64, , sse);
+decl_fpel_func(avg, 4, _8, mmxext);
+decl_fpel_func(avg, 8, _8, mmxext);
+decl_fpel_func(avg, 16, _8, sse2);
+decl_fpel_func(avg, 32, _8, sse2);
+decl_fpel_func(avg, 64, _8, sse2);
+decl_fpel_func(put, 32, , avx);
+decl_fpel_func(put, 64, , avx);
+decl_fpel_func(avg, 32, _8, avx2);
+decl_fpel_func(avg, 64, _8, avx2);
+
+decl_mc_funcs(4, mmxext, int16_t, 8, 8);
+decl_mc_funcs(8, sse2, int16_t, 8, 8);
+decl_mc_funcs(4, ssse3, int8_t, 32, 8);
+decl_mc_funcs(8, ssse3, int8_t, 32, 8);
#if ARCH_X86_64
-mc_funcs(16, ssse3, int8_t, 32);
-mc_funcs(32, avx2, int8_t, 32);
+decl_mc_funcs(16, ssse3, int8_t, 32, 8);
+decl_mc_funcs(32, avx2, int8_t, 32, 8);
#endif
-#undef mc_funcs
-#undef mc_func
-
-#define mc_rep_func(avg, sz, hsz, dir, opt, type, f_sz) \
-static av_always_inline void \
-ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *src, \
- ptrdiff_t src_stride, \
- int h, \
- const type (*filter)[f_sz]) \
-{ \
- ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst, \
- dst_stride, \
- src, \
- src_stride, \
- h, \
- filter); \
- ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst + hsz, \
- dst_stride, \
- src + hsz, \
- src_stride, \
- h, filter); \
-}
-
-#define mc_rep_funcs(sz, hsz, opt, type, f_sz) \
- mc_rep_func(put, sz, hsz, h, opt, type, f_sz) \
- mc_rep_func(avg, sz, hsz, h, opt, type, f_sz) \
- mc_rep_func(put, sz, hsz, v, opt, type, f_sz) \
- mc_rep_func(avg, sz, hsz, v, opt, type, f_sz)
-
-mc_rep_funcs(16, 8, sse2, int16_t, 8)
+mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8)
#if ARCH_X86_32
-mc_rep_funcs(16, 8, ssse3, int8_t, 32)
+mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8)
#endif
-mc_rep_funcs(32, 16, sse2, int16_t, 8)
-mc_rep_funcs(32, 16, ssse3, int8_t, 32)
-mc_rep_funcs(64, 32, sse2, int16_t, 8)
-mc_rep_funcs(64, 32, ssse3, int8_t, 32)
+mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8)
+mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8)
+mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8)
+mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-mc_rep_funcs(64, 32, avx2, int8_t, 32)
+mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8)
#endif
-#undef mc_rep_funcs
-#undef mc_rep_func
-
extern const int8_t ff_filters_ssse3[3][15][4][32];
extern const int16_t ff_filters_sse2[3][15][8][8];
-#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, opt) \
-static void \
-op ## _8tap_ ## fname ## _ ## sz ## hv_ ## opt(uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *src, \
- ptrdiff_t src_stride, \
- int h, int mx, int my) \
-{ \
- LOCAL_ALIGNED_ ## align(uint8_t, temp, [71 * 64]); \
- ff_vp9_put_8tap_1d_h_ ## sz ## _ ## opt(temp, 64, \
- src - 3 * src_stride, \
- src_stride, h + 7, \
- ff_filters_ ## f_opt[f][mx - 1]); \
- ff_vp9_ ## op ## _8tap_1d_v_ ## sz ## _ ## opt(dst, dst_stride, \
- temp + 3 * 64, 64, h, \
- ff_filters_ ## f_opt[f][my - 1]); \
-}
-
-#define filters_8tap_2d_fn(op, sz, align, opt, f_opt) \
- filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, opt) \
- filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, opt) \
- filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, opt)
-
-#define filters_8tap_2d_fn2(op, align, opt4, opt8, f_opt) \
- filters_8tap_2d_fn(op, 64, align, opt8, f_opt) \
- filters_8tap_2d_fn(op, 32, align, opt8, f_opt) \
- filters_8tap_2d_fn(op, 16, align, opt8, f_opt) \
- filters_8tap_2d_fn(op, 8, align, opt8, f_opt) \
- filters_8tap_2d_fn(op, 4, align, opt4, f_opt)
-
-
-filters_8tap_2d_fn2(put, 16, mmxext, sse2, sse2)
-filters_8tap_2d_fn2(avg, 16, mmxext, sse2, sse2)
-filters_8tap_2d_fn2(put, 16, ssse3, ssse3, ssse3)
-filters_8tap_2d_fn2(avg, 16, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-filters_8tap_2d_fn(put, 64, 32, avx2, ssse3)
-filters_8tap_2d_fn(put, 32, 32, avx2, ssse3)
-filters_8tap_2d_fn(avg, 64, 32, avx2, ssse3)
-filters_8tap_2d_fn(avg, 32, 32, avx2, ssse3)
+filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
#endif
-#undef filters_8tap_2d_fn2
-#undef filters_8tap_2d_fn
-#undef filter_8tap_2d_fn
-
-#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, opt) \
-static void \
-op ## _8tap_ ## fname ## _ ## sz ## dir ## _ ## opt(uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *src, \
- ptrdiff_t src_stride, \
- int h, int mx, \
- int my) \
-{ \
- ff_vp9_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(dst, \
- dst_stride, \
- src, \
- src_stride, h,\
- ff_filters_ ## f_opt[f][dvar - 1]); \
-}
-
-#define filters_8tap_1d_fn(op, sz, dir, dvar, opt, f_opt) \
- filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, opt) \
- filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, opt) \
- filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, opt)
-
-#define filters_8tap_1d_fn2(op, sz, opt, f_opt) \
- filters_8tap_1d_fn(op, sz, h, mx, opt, f_opt) \
- filters_8tap_1d_fn(op, sz, v, my, opt, f_opt)
-
-#define filters_8tap_1d_fn3(op, opt4, opt8, f_opt) \
- filters_8tap_1d_fn2(op, 64, opt8, f_opt) \
- filters_8tap_1d_fn2(op, 32, opt8, f_opt) \
- filters_8tap_1d_fn2(op, 16, opt8, f_opt) \
- filters_8tap_1d_fn2(op, 8, opt8, f_opt) \
- filters_8tap_1d_fn2(op, 4, opt4, f_opt)
-
-filters_8tap_1d_fn3(put, mmxext, sse2, sse2)
-filters_8tap_1d_fn3(avg, mmxext, sse2, sse2)
-filters_8tap_1d_fn3(put, ssse3, ssse3, ssse3)
-filters_8tap_1d_fn3(avg, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-filters_8tap_1d_fn2(put, 64, avx2, ssse3)
-filters_8tap_1d_fn2(put, 32, avx2, ssse3)
-filters_8tap_1d_fn2(avg, 64, avx2, ssse3)
-filters_8tap_1d_fn2(avg, 32, avx2, ssse3)
+filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
#endif
-#undef filters_8tap_1d_fn
-#undef filters_8tap_1d_fn2
-#undef filters_8tap_1d_fn3
-#undef filter_8tap_1d_fn
+#define itxfm_func(typea, typeb, size, opt) \
+void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int16_t *block, int eob)
+#define itxfm_funcs(size, opt) \
+itxfm_func(idct, idct, size, opt); \
+itxfm_func(iadst, idct, size, opt); \
+itxfm_func(idct, iadst, size, opt); \
+itxfm_func(iadst, iadst, size, opt)
+
+itxfm_func(idct, idct, 4, mmxext);
+itxfm_func(idct, iadst, 4, sse2);
+itxfm_func(iadst, idct, 4, sse2);
+itxfm_func(iadst, iadst, 4, sse2);
+itxfm_funcs(4, ssse3);
+itxfm_funcs(8, sse2);
+itxfm_funcs(8, ssse3);
+itxfm_funcs(8, avx);
+itxfm_funcs(16, sse2);
+itxfm_funcs(16, ssse3);
+itxfm_funcs(16, avx);
+itxfm_func(idct, idct, 32, sse2);
+itxfm_func(idct, idct, 32, ssse3);
+itxfm_func(idct, idct, 32, avx);
+itxfm_func(iwht, iwht, 4, mmx);
+itxfm_funcs(16, avx2);
+itxfm_func(idct, idct, 32, avx2);
+
+#undef itxfm_func
+#undef itxfm_funcs
#define lpf_funcs(size1, size2, opt) \
void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
@@ -223,6 +126,8 @@ void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
int E, int I, int H)
+lpf_funcs(4, 8, mmxext);
+lpf_funcs(8, 8, mmxext);
lpf_funcs(16, 16, sse2);
lpf_funcs(16, 16, ssse3);
lpf_funcs(16, 16, avx);
@@ -241,42 +146,88 @@ lpf_funcs(88, 16, avx);
#undef lpf_funcs
-#endif /* HAVE_YASM */
+#define ipred_func(size, type, opt) \
+void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *l, const uint8_t *a)
-av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
-{
-#if HAVE_YASM
- int cpu_flags = av_get_cpu_flags();
+ipred_func(8, v, mmx);
+
+#define ipred_dc_funcs(size, opt) \
+ipred_func(size, dc, opt); \
+ipred_func(size, dc_left, opt); \
+ipred_func(size, dc_top, opt)
+
+ipred_dc_funcs(4, mmxext);
+ipred_dc_funcs(8, mmxext);
+
+#define ipred_dir_tm_funcs(size, opt) \
+ipred_func(size, tm, opt); \
+ipred_func(size, dl, opt); \
+ipred_func(size, dr, opt); \
+ipred_func(size, hd, opt); \
+ipred_func(size, hu, opt); \
+ipred_func(size, vl, opt); \
+ipred_func(size, vr, opt)
+
+ipred_dir_tm_funcs(4, mmxext);
+
+ipred_func(16, v, sse);
+ipred_func(32, v, sse);
-#define init_fpel(idx1, idx2, sz, type, opt) \
- dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
- dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
- dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
- dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_ ## type ## sz ## _ ## opt
+ipred_dc_funcs(16, sse2);
+ipred_dc_funcs(32, sse2);
+#define ipred_dir_tm_h_funcs(size, opt) \
+ipred_dir_tm_funcs(size, opt); \
+ipred_func(size, h, opt)
-#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
- dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv] = type ## _8tap_smooth_ ## sz ## dir ## _ ## opt; \
- dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \
- dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv] = type ## _8tap_sharp_ ## sz ## dir ## _ ## opt
+ipred_dir_tm_h_funcs(8, sse2);
+ipred_dir_tm_h_funcs(16, sse2);
+ipred_dir_tm_h_funcs(32, sse2);
-#define init_subpel2(idx1, idx2, sz, type, opt) \
- init_subpel1(idx1, idx2, 1, 1, sz, hv, type, opt); \
- init_subpel1(idx1, idx2, 0, 1, sz, v, type, opt); \
- init_subpel1(idx1, idx2, 1, 0, sz, h, type, opt)
+ipred_func(4, h, sse2);
-#define init_subpel3_32_64(idx, type, opt) \
- init_subpel2(0, idx, 64, type, opt); \
- init_subpel2(1, idx, 32, type, opt)
+#define ipred_all_funcs(size, opt) \
+ipred_dc_funcs(size, opt); \
+ipred_dir_tm_h_funcs(size, opt)
-#define init_subpel3_8to64(idx, type, opt) \
- init_subpel3_32_64(idx, type, opt); \
- init_subpel2(2, idx, 16, type, opt); \
- init_subpel2(3, idx, 8, type, opt)
+// FIXME hd/vl_4x4_ssse3 does not exist
+ipred_all_funcs(4, ssse3);
+ipred_all_funcs(8, ssse3);
+ipred_all_funcs(16, ssse3);
+ipred_all_funcs(32, ssse3);
-#define init_subpel3(idx, type, opt) \
- init_subpel3_8to64(idx, type, opt); \
- init_subpel2(4, idx, 4, type, opt)
+ipred_dir_tm_h_funcs(8, avx);
+ipred_dir_tm_h_funcs(16, avx);
+ipred_dir_tm_h_funcs(32, avx);
+
+ipred_func(32, v, avx);
+
+ipred_dc_funcs(32, avx2);
+ipred_func(32, h, avx2);
+ipred_func(32, tm, avx2);
+
+#undef ipred_func
+#undef ipred_dir_tm_h_funcs
+#undef ipred_dir_tm_funcs
+#undef ipred_dc_funcs
+
+#endif /* HAVE_YASM */
+
+av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
+{
+#if HAVE_YASM
+ int cpu_flags;
+
+ if (bpp == 10) {
+ ff_vp9dsp_init_10bpp_x86(dsp, bitexact);
+ return;
+ } else if (bpp == 12) {
+ ff_vp9dsp_init_12bpp_x86(dsp, bitexact);
+ return;
+ }
+
+ cpu_flags = av_get_cpu_flags();
#define init_lpf(opt) do { \
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
@@ -291,53 +242,169 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
} while (0)
+#define init_ipred(sz, opt, t, e) \
+ dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
+
+#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
+#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
+#define init_dir_tm_ipred(sz, opt) do { \
+ init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
+ init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
+ init_ipred(sz, opt, hd, HOR_DOWN); \
+ init_ipred(sz, opt, vl, VERT_LEFT); \
+ init_ipred(sz, opt, hu, HOR_UP); \
+ init_ipred(sz, opt, tm, TM_VP8); \
+ init_ipred(sz, opt, vr, VERT_RIGHT); \
+} while (0)
+#define init_dir_tm_h_ipred(sz, opt) do { \
+ init_dir_tm_ipred(sz, opt); \
+ init_ipred(sz, opt, h, HOR); \
+} while (0)
+#define init_dc_ipred(sz, opt) do { \
+ init_ipred(sz, opt, dc, DC); \
+ init_ipred(sz, opt, dc_left, LEFT_DC); \
+ init_ipred(sz, opt, dc_top, TOP_DC); \
+} while (0)
+#define init_all_ipred(sz, opt) do { \
+ init_dc_ipred(sz, opt); \
+ init_dir_tm_h_ipred(sz, opt); \
+} while (0)
+
if (EXTERNAL_MMX(cpu_flags)) {
- init_fpel(4, 0, 4, put, mmx);
- init_fpel(3, 0, 8, put, mmx);
+ init_fpel_func(4, 0, 4, put, , mmx);
+ init_fpel_func(3, 0, 8, put, , mmx);
+ if (!bitexact) {
+ dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+ dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+ dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+ dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
+ }
+ init_ipred(8, mmx, v, VERT);
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
- init_subpel2(4, 0, 4, put, mmxext);
- init_subpel2(4, 1, 4, avg, mmxext);
- init_fpel(4, 1, 4, avg, mmxext);
- init_fpel(3, 1, 8, avg, mmxext);
+ dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext;
+ dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext;
+ dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext;
+ dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext;
+ init_subpel2(4, 0, 4, put, 8, mmxext);
+ init_subpel2(4, 1, 4, avg, 8, mmxext);
+ init_fpel_func(4, 1, 4, avg, _8, mmxext);
+ init_fpel_func(3, 1, 8, avg, _8, mmxext);
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
+ init_dc_ipred(4, mmxext);
+ init_dc_ipred(8, mmxext);
+ init_dir_tm_ipred(4, mmxext);
}
if (EXTERNAL_SSE(cpu_flags)) {
- init_fpel(2, 0, 16, put, sse);
- init_fpel(1, 0, 32, put, sse);
- init_fpel(0, 0, 64, put, sse);
+ init_fpel_func(2, 0, 16, put, , sse);
+ init_fpel_func(1, 0, 32, put, , sse);
+ init_fpel_func(0, 0, 64, put, , sse);
+ init_ipred(16, sse, v, VERT);
+ init_ipred(32, sse, v, VERT);
}
if (EXTERNAL_SSE2(cpu_flags)) {
- init_subpel3_8to64(0, put, sse2);
- init_subpel3_8to64(1, avg, sse2);
- init_fpel(2, 1, 16, avg, sse2);
- init_fpel(1, 1, 32, avg, sse2);
- init_fpel(0, 1, 64, avg, sse2);
+ init_subpel3_8to64(0, put, 8, sse2);
+ init_subpel3_8to64(1, avg, 8, sse2);
+ init_fpel_func(2, 1, 16, avg, _8, sse2);
+ init_fpel_func(1, 1, 32, avg, _8, sse2);
+ init_fpel_func(0, 1, 64, avg, _8, sse2);
init_lpf(sse2);
+ dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2;
+ dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2;
+ dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2;
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
+ init_dc_ipred(16, sse2);
+ init_dc_ipred(32, sse2);
+ init_dir_tm_h_ipred(8, sse2);
+ init_dir_tm_h_ipred(16, sse2);
+ init_dir_tm_h_ipred(32, sse2);
+ init_ipred(4, sse2, h, HOR);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
- init_subpel3(0, put, ssse3);
- init_subpel3(1, avg, ssse3);
+ init_subpel3(0, put, 8, ssse3);
+ init_subpel3(1, avg, 8, ssse3);
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
init_lpf(ssse3);
+ init_all_ipred(4, ssse3);
+ init_all_ipred(8, ssse3);
+ init_all_ipred(16, ssse3);
+ init_all_ipred(32, ssse3);
}
if (EXTERNAL_AVX(cpu_flags)) {
- init_fpel(1, 0, 32, put, avx);
- init_fpel(0, 0, 64, put, avx);
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
init_lpf(avx);
+ init_dir_tm_h_ipred(8, avx);
+ init_dir_tm_h_ipred(16, avx);
+ init_dir_tm_h_ipred(32, avx);
+ }
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ init_fpel_func(1, 0, 32, put, , avx);
+ init_fpel_func(0, 0, 64, put, , avx);
+ init_ipred(32, avx, v, VERT);
}
- if (EXTERNAL_AVX2(cpu_flags)) {
- init_fpel(1, 1, 32, avg, avx2);
- init_fpel(0, 1, 64, avg, avx2);
-
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ init_fpel_func(1, 1, 32, avg, _8, avx2);
+ init_fpel_func(0, 1, 64, avg, _8, avx2);
+ if (ARCH_X86_64) {
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
- init_subpel3_32_64(0, put, avx2);
- init_subpel3_32_64(1, avg, avx2);
-#endif /* ARCH_X86_64 && HAVE_AVX2_EXTERNAL */
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2;
+ init_subpel3_32_64(0, put, 8, avx2);
+ init_subpel3_32_64(1, avg, 8, avx2);
+#endif
+ }
+ init_dc_ipred(32, avx2);
+ init_ipred(32, avx2, h, HOR);
+ init_ipred(32, avx2, tm, TM_VP8);
}
#undef init_fpel
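The vp9dsp_init.c rewrite moves the wrapper-generating macros into the new vp9dsp_init.h (below) and gives them an explicit bits-per-pixel argument, so the same templates can emit _8, _10 and _12 variants, and the init function now forwards 10/12 bpp to ff_vp9dsp_init_10bpp_x86() / ff_vp9dsp_init_12bpp_x86() before touching any 8-bit pointers. The cat() helper in the header exists because ## pastes its operands without expanding them; routing the paste through an extra function-like macro is the standard way to get a macro such as BPC expanded first. A small standalone illustration of that idiom (hypothetical names, not the FFmpeg macros):

#include <stdio.h>

#define BPC 12

/* Direct pasting would produce the undefined token init_BPC_bpp; the extra
 * expansion level in CAT() lets BPC expand to 12 before the tokens are glued. */
#define CAT_DIRECT(a, bpp, b) a##bpp##b
#define CAT(a, bpp, b)        CAT_DIRECT(a, bpp, b)

static void init_12_bpp(void) { puts("12 bpp init"); }

int main(void)
{
    CAT(init_, BPC, _bpp)();  /* expands to init_12_bpp() */
    return 0;
}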
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
new file mode 100644
index 0000000000..e410cab3a1
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -0,0 +1,189 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP9DSP_INIT_H
+#define AVCODEC_X86_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+// hack to force-expand BPC
+#define cat(a, bpp, b) a##bpp##b
+
+#define decl_fpel_func(avg, sz, bpp, opt) \
+void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \
+void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, const type (*filter)[f_sz])
+
+#define decl_mc_funcs(sz, opt, type, fsz, bpp) \
+decl_mc_func(put, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
+
+#define decl_ipred_fn(type, sz, bpp, opt) \
+void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
+ ptrdiff_t stride, \
+ const uint8_t *l, \
+ const uint8_t *a)
+
+#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
+decl_ipred_fn(type, 4, bpp, opt4); \
+decl_ipred_fn(type, 8, bpp, opt8_16_32); \
+decl_ipred_fn(type, 16, bpp, opt8_16_32); \
+decl_ipred_fn(type, 32, bpp, opt8_16_32)
+
+#define decl_itxfm_func(typea, typeb, size, bpp, opt) \
+void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int16_t *block, \
+ int eob)
+
+#define decl_itxfm_funcs(size, bpp, opt) \
+decl_itxfm_func(idct, idct, size, bpp, opt); \
+decl_itxfm_func(iadst, idct, size, bpp, opt); \
+decl_itxfm_func(idct, iadst, size, bpp, opt); \
+decl_itxfm_func(iadst, iadst, size, bpp, opt)
+
+#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
+static av_always_inline void \
+ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, const type (*filter)[f_sz]) \
+{ \
+ ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst, dst_stride, src, \
+ src_stride, h, filter); \
+ ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \
+ src_stride, h, filter); \
+}
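+
+/* mc_rep_func synthesizes a sz-wide 8-tap 1D filter from two hsz-wide calls,
+ * the second one offset by hszb bytes into both src and dst */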
+
+#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp) \
+mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp) \
+mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp)
+
+#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \
+static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \
+ h, ff_filters_##f_opt[f][dvar - 1]); \
+}
+
+#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, bpp, opt)
+
+#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt)
+
+#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt)
+
+#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \
+static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \
+ ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \
+ src_stride, h + 7, \
+ ff_filters_##f_opt[f][mx - 1]); \
+ ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \
+ 64 * bytes, h, \
+ ff_filters_##f_opt[f][my - 1]); \
+}
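+
+/* the 2D (hv) case is a horizontal put pass into an aligned temporary buffer
+ * (h + 7 rows, covering the 8-tap vertical context) followed by the requested
+ * put/avg vertical pass from that buffer into dst */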
+
+#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, bpp, bytes, opt)
+
+#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \
+filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
+
+#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
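+
+/* fullpel (mx == my == 0) copy/avg ignores the filter type, so the same
+ * function is assigned to all four filter entries */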
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+ type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+ type##_8tap_regular_##sz##dir##_##bpp##_##opt; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
+ type##_8tap_sharp_##sz##dir##_##bpp##_##opt
+
+#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \
+ init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \
+ init_subpel1(idx1, idx2, 0, 1, sz, v, type, bpp, opt); \
+ init_subpel1(idx1, idx2, 1, 0, sz, h, type, bpp, opt)
+
+#define init_subpel3_32_64(idx, type, bpp, opt) \
+ init_subpel2(0, idx, 64, type, bpp, opt); \
+ init_subpel2(1, idx, 32, type, bpp, opt)
+
+#define init_subpel3_8to64(idx, type, bpp, opt) \
+ init_subpel3_32_64(idx, type, bpp, opt); \
+ init_subpel2(2, idx, 16, type, bpp, opt); \
+ init_subpel2(3, idx, 8, type, bpp, opt)
+
+#define init_subpel3(idx, type, bpp, opt) \
+ init_subpel3_8to64(idx, type, bpp, opt); \
+ init_subpel2(4, idx, 4, type, bpp, opt)
+
+#define init_ipred_func(type, enum, sz, bpp, opt) \
+ dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
+ cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
+
+#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
+ init_ipred_func(type, enum, 8, bpp, opt); \
+ init_ipred_func(type, enum, 16, bpp, opt); \
+ init_ipred_func(type, enum, 32, bpp, opt)
+
+#define init_ipred_funcs(type, enum, bpp, opt) \
+ init_ipred_func(type, enum, 4, bpp, opt); \
+ init_8_16_32_ipred_funcs(type, enum, bpp, opt)
+
+void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp, int bitexact);
+void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp, int bitexact);
+void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_X86_VP9DSP_INIT_H */
diff --git a/libavcodec/x86/vp9dsp_init_10bpp.c b/libavcodec/x86/vp9dsp_init_10bpp.c
new file mode 100644
index 0000000000..2694c06cb2
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_10bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_x86
+#include "vp9dsp_init_16bpp_template.c"
diff --git a/libavcodec/x86/vp9dsp_init_12bpp.c b/libavcodec/x86/vp9dsp_init_12bpp.c
new file mode 100644
index 0000000000..5da3bc1840
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_12bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_x86
+#include "vp9dsp_init_16bpp_template.c"
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
new file mode 100644
index 0000000000..4576ff1692
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -0,0 +1,141 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_YASM
+
+decl_fpel_func(put, 8, , mmx);
+decl_fpel_func(avg, 8, _16, mmxext);
+decl_fpel_func(put, 16, , sse);
+decl_fpel_func(put, 32, , sse);
+decl_fpel_func(put, 64, , sse);
+decl_fpel_func(put, 128, , sse);
+decl_fpel_func(avg, 16, _16, sse2);
+decl_fpel_func(avg, 32, _16, sse2);
+decl_fpel_func(avg, 64, _16, sse2);
+decl_fpel_func(avg, 128, _16, sse2);
+decl_fpel_func(put, 32, , avx);
+decl_fpel_func(put, 64, , avx);
+decl_fpel_func(put, 128, , avx);
+decl_fpel_func(avg, 32, _16, avx2);
+decl_fpel_func(avg, 64, _16, avx2);
+decl_fpel_func(avg, 128, _16, avx2);
+
+decl_ipred_fns(v, 16, mmx, sse);
+decl_ipred_fns(h, 16, mmxext, sse2);
+decl_ipred_fns(dc, 16, mmxext, sse2);
+decl_ipred_fns(dc_top, 16, mmxext, sse2);
+decl_ipred_fns(dc_left, 16, mmxext, sse2);
+decl_ipred_fn(dl, 16, 16, avx2);
+
+#define decl_ipred_dir_funcs(type) \
+decl_ipred_fns(type, 16, sse2, sse2); \
+decl_ipred_fns(type, 16, ssse3, ssse3); \
+decl_ipred_fns(type, 16, avx, avx)
+
+decl_ipred_dir_funcs(dl);
+decl_ipred_dir_funcs(dr);
+decl_ipred_dir_funcs(vl);
+decl_ipred_dir_funcs(vr);
+decl_ipred_dir_funcs(hu);
+decl_ipred_dir_funcs(hd);
+#endif /* HAVE_YASM */
+
+av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ init_fpel_func(4, 0, 8, put, , mmx);
+ init_ipred_func(v, VERT, 4, 16, mmx);
+ }
+
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ init_fpel_func(4, 1, 8, avg, _16, mmxext);
+ init_ipred_func(h, HOR, 4, 16, mmxext);
+ init_ipred_func(dc, DC, 4, 16, mmxext);
+ init_ipred_func(dc_top, TOP_DC, 4, 16, mmxext);
+ init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
+ }
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ init_fpel_func(3, 0, 16, put, , sse);
+ init_fpel_func(2, 0, 32, put, , sse);
+ init_fpel_func(1, 0, 64, put, , sse);
+ init_fpel_func(0, 0, 128, put, , sse);
+ init_8_16_32_ipred_funcs(v, VERT, 16, sse);
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ init_fpel_func(3, 1, 16, avg, _16, sse2);
+ init_fpel_func(2, 1, 32, avg, _16, sse2);
+ init_fpel_func(1, 1, 64, avg, _16, sse2);
+ init_fpel_func(0, 1, 128, avg, _16, sse2);
+ init_8_16_32_ipred_funcs(h, HOR, 16, sse2);
+ init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
+ init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2);
+ init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
+ init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2);
+ init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
+ init_ipred_funcs(vl, VERT_LEFT, 16, sse2);
+ init_ipred_funcs(vr, VERT_RIGHT, 16, sse2);
+ init_ipred_funcs(hu, HOR_UP, 16, sse2);
+ init_ipred_funcs(hd, HOR_DOWN, 16, sse2);
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3);
+ init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
+ init_ipred_funcs(vl, VERT_LEFT, 16, ssse3);
+ init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3);
+ init_ipred_funcs(hu, HOR_UP, 16, ssse3);
+ init_ipred_funcs(hd, HOR_DOWN, 16, ssse3);
+ }
+
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ init_fpel_func(2, 0, 32, put, , avx);
+ init_fpel_func(1, 0, 64, put, , avx);
+ init_fpel_func(0, 0, 128, put, , avx);
+ init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx);
+ init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
+ init_ipred_funcs(vl, VERT_LEFT, 16, avx);
+ init_ipred_funcs(vr, VERT_RIGHT, 16, avx);
+ init_ipred_funcs(hu, HOR_UP, 16, avx);
+ init_ipred_funcs(hd, HOR_DOWN, 16, avx);
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ init_fpel_func(2, 1, 32, avg, _16, avx2);
+ init_fpel_func(1, 1, 64, avg, _16, avx2);
+ init_fpel_func(0, 1, 128, avg, _16, avx2);
+ init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+ }
+
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
new file mode 100644
index 0000000000..4840b2844e
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -0,0 +1,240 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_YASM
+
+extern const int16_t ff_filters_16bpp[3][15][4][16];
+
+decl_mc_funcs(4, sse2, int16_t, 16, BPC);
+decl_mc_funcs(8, sse2, int16_t, 16, BPC);
+decl_mc_funcs(16, avx2, int16_t, 16, BPC);
+
+mc_rep_funcs(16, 8, 16, sse2, int16_t, 16, BPC)
+mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC)
+mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC)
+#if HAVE_AVX2_EXTERNAL
+mc_rep_funcs(32, 16, 32, avx2, int16_t, 16, BPC)
+mc_rep_funcs(64, 32, 64, avx2, int16_t, 16, BPC)
+#endif
+
+filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp)
+filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_2d_fn(put, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 16, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 16, 32, BPC, 2, avx2, 16bpp)
+#endif
+
+filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp)
+filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_1d_fn2(put, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp)
+#endif
+
+#define decl_lpf_func(dir, wd, bpp, opt) \
+void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H)
+
+#define decl_lpf_funcs(dir, wd, bpp) \
+decl_lpf_func(dir, wd, bpp, sse2); \
+decl_lpf_func(dir, wd, bpp, ssse3); \
+decl_lpf_func(dir, wd, bpp, avx)
+
+#define decl_lpf_funcs_wd(dir) \
+decl_lpf_funcs(dir, 4, BPC); \
+decl_lpf_funcs(dir, 8, BPC); \
+decl_lpf_funcs(dir, 16, BPC)
+
+decl_lpf_funcs_wd(h);
+decl_lpf_funcs_wd(v);
+
+#define lpf_16_wrapper(dir, off, bpp, opt) \
+static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H) \
+{ \
+ ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst, stride, E, I, H); \
+ ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \
+}
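+
+/* the underlying high-bitdepth assembly covers 8 pixels along the edge, so the
+ * full 16-pixel filter is two calls, offset by 8 rows (h) or 8 pixels,
+ * i.e. 16 bytes (v) */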
+
+#define lpf_16_wrappers(bpp, opt) \
+lpf_16_wrapper(h, 8 * stride, bpp, opt) \
+lpf_16_wrapper(v, 16, bpp, opt)
+
+lpf_16_wrappers(BPC, sse2)
+lpf_16_wrappers(BPC, ssse3)
+lpf_16_wrappers(BPC, avx)
+
+#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \
+static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H) \
+{ \
+ ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst, stride, \
+ E & 0xff, I & 0xff, H & 0xff); \
+ ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \
+ E >> 8, I >> 8, H >> 8); \
+}
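+
+/* the mix2 wrappers filter two adjacent 8-pixel edges with possibly different
+ * filter widths; the second edge's E/I/H thresholds are packed in the high
+ * byte of each parameter */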
+
+#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \
+lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt) \
+lpf_mix2_wrapper(v, 16, wd1, wd2, bpp, opt)
+
+#define lpf_mix2_wrappers_set(bpp, opt) \
+lpf_mix2_wrappers(4, 4, bpp, opt) \
+lpf_mix2_wrappers(4, 8, bpp, opt) \
+lpf_mix2_wrappers(8, 4, bpp, opt) \
+lpf_mix2_wrappers(8, 8, bpp, opt)
+
+lpf_mix2_wrappers_set(BPC, sse2)
+lpf_mix2_wrappers_set(BPC, ssse3)
+lpf_mix2_wrappers_set(BPC, avx)
+
+decl_ipred_fns(tm, BPC, mmxext, sse2);
+
+decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+decl_itxfm_func(idct, idct, 4, BPC, mmxext);
+decl_itxfm_funcs(4, BPC, ssse3);
+#else
+decl_itxfm_func(idct, idct, 4, BPC, sse2);
+#endif
+decl_itxfm_func(idct, iadst, 4, BPC, sse2);
+decl_itxfm_func(iadst, idct, 4, BPC, sse2);
+decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
+decl_itxfm_funcs(8, BPC, sse2);
+decl_itxfm_funcs(16, BPC, sse2);
+decl_itxfm_func(idct, idct, 32, BPC, sse2);
+#endif /* HAVE_YASM */
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
+ dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt
+#define init_lpf_16_func(idx, dir, bpp, opt) \
+ dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
+#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
+ dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt
+
+#define init_lpf_funcs(bpp, opt) \
+ init_lpf_8_func(0, 0, h, 4, bpp, opt); \
+ init_lpf_8_func(0, 1, v, 4, bpp, opt); \
+ init_lpf_8_func(1, 0, h, 8, bpp, opt); \
+ init_lpf_8_func(1, 1, v, 8, bpp, opt); \
+ init_lpf_8_func(2, 0, h, 16, bpp, opt); \
+ init_lpf_8_func(2, 1, v, 16, bpp, opt); \
+ init_lpf_16_func(0, h, bpp, opt); \
+ init_lpf_16_func(1, v, bpp, opt); \
+ init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \
+ init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \
+ init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \
+ init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \
+ init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \
+ init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \
+ init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
+ init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
+
+#define init_itx_func(idxa, idxb, typea, typeb, size, bpp, opt) \
+ dsp->itxfm_add[idxa][idxb] = \
+ cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt);
+#define init_itx_func_one(idx, typea, typeb, size, bpp, opt) \
+ init_itx_func(idx, DCT_DCT, typea, typeb, size, bpp, opt); \
+ init_itx_func(idx, ADST_DCT, typea, typeb, size, bpp, opt); \
+ init_itx_func(idx, DCT_ADST, typea, typeb, size, bpp, opt); \
+ init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt)
+#define init_itx_funcs(idx, size, bpp, opt) \
+ init_itx_func(idx, DCT_DCT, idct, idct, size, bpp, opt); \
+ init_itx_func(idx, ADST_DCT, idct, iadst, size, bpp, opt); \
+ init_itx_func(idx, DCT_ADST, iadst, idct, size, bpp, opt); \
+    init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt);
+
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
+ if (!bitexact) {
+ init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+ init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext);
+#endif
+ }
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ init_subpel3(0, put, BPC, sse2);
+ init_subpel3(1, avg, BPC, sse2);
+ init_lpf_funcs(BPC, sse2);
+ init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
+#if BPC == 10
+ if (!bitexact) {
+ init_itx_func(TX_4X4, ADST_DCT, idct, iadst, 4, 10, sse2);
+ init_itx_func(TX_4X4, DCT_ADST, iadst, idct, 4, 10, sse2);
+ init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2);
+ }
+#else
+ init_itx_funcs(TX_4X4, 4, 12, sse2);
+#endif
+ init_itx_funcs(TX_8X8, 8, BPC, sse2);
+ init_itx_funcs(TX_16X16, 16, BPC, sse2);
+ init_itx_func_one(TX_32X32, idct, idct, 32, BPC, sse2);
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ init_lpf_funcs(BPC, ssse3);
+#if BPC == 10
+ if (!bitexact) {
+ init_itx_funcs(TX_4X4, 4, BPC, ssse3);
+ }
+#endif
+ }
+
+ if (EXTERNAL_AVX(cpu_flags)) {
+ init_lpf_funcs(BPC, avx);
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if HAVE_AVX2_EXTERNAL
+ init_subpel3_32_64(0, put, BPC, avx2);
+ init_subpel3_32_64(1, avg, BPC, avx2);
+ init_subpel2(2, 0, 16, put, BPC, avx2);
+ init_subpel2(2, 1, 16, avg, BPC, avx2);
+#endif
+ }
+
+#endif /* HAVE_YASM */
+
+ ff_vp9dsp_init_16bpp_x86(dsp);
+}
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
new file mode 100644
index 0000000000..31f7d449fd
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* Parts based on:
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Fiona Glaser
+;* Copyright (c) 2010 Holger Lubitz
+;* Copyright (c) 2010 Loren Merritt
+;* Copyright (c) 2010 Ronald S. Bultje
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m256: times 16 dw -256
+pw_m255: times 16 dw -255
+pw_4096: times 8 dw 4096
+
+pb_4x3_4x2_4x1_4x0: times 4 db 3
+ times 4 db 2
+ times 4 db 1
+ times 4 db 0
+pb_8x1_8x0: times 8 db 1
+ times 8 db 0
+pb_8x3_8x2: times 8 db 3
+ times 8 db 2
+pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7
+ times 8 db -1
+pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6
+ times 9 db 7
+pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
+ times 10 db 7
+pb_2to6_3x7:
+pb_2to6_11x7: db 2, 3, 4, 5, 6
+ times 11 db 7
+pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+pb_13456_3xm1: db 1, 3, 4, 5, 6
+ times 3 db -1
+pb_6012_4xm1: db 6, 0, 1, 2
+ times 4 db -1
+pb_6xm1_246_8toE: times 6 db -1
+ db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
+pb_6xm1_BDF_0to6: times 6 db -1
+ db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
+pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+
+pb_15x0_1xm1: times 15 db 0
+ db -1
+pb_0to2_5x3: db 0, 1, 2
+ times 5 db 3
+pb_6xm1_2x0: times 6 db -1
+ times 2 db 0
+pb_6x0_2xm1: times 6 db 0
+ times 2 db -1
+
+cextern pb_1
+cextern pb_2
+cextern pb_3
+cextern pb_15
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_255
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_8192
+
+SECTION .text
+
+; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_4to8_FUNCS 0
+cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq]
+ pxor m1, m1
+ psadbw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_4096]
+ pshufb m0, m1
+%else
+ paddw m0, [pw_4]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ RET
+
+cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [lq]
+ movq m1, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_8]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DC_4to8_FUNCS
+INIT_MMX ssse3
+DC_4to8_FUNCS
+
+%macro DC_16to32_FUNCS 0
+cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_1024]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_16]
+ psraw m0, 5
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [lq+16]
+ mova m2, [aq]
+ mova m3, [aq+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m4, m4
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m4
+ psadbw m3, m4
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_512]
+ pshufb m0, m4
+%else
+ paddw m0, [pw_32]
+ psraw m0, 6
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DC_16to32_FUNCS
+INIT_XMM ssse3
+DC_16to32_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ movhlps xm1, xm0
+ paddw xm0, xm1
+ pmulhrsw xm0, [pw_512]
+ vpbroadcastb m0, xm0
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endif
+
+; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [%2q]
+ pxor m1, m1
+ psadbw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_8192]
+ pshufb m0, m1
+%else
+ paddw m0, [pw_2]
+ psraw m0, 2
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ RET
+
+cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pxor m1, m1
+ psadbw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_4096]
+ pshufb m0, m1
+%else
+ paddw m0, [pw_4]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DC_1D_4to8_FUNCS top, a
+DC_1D_4to8_FUNCS left, l
+INIT_MMX ssse3
+DC_1D_4to8_FUNCS top, a
+DC_1D_4to8_FUNCS left, l
+
+%macro DC_1D_16to32_FUNCS 2 ; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_8]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ mova m1, [%2q+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_1024]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_16]
+ psraw m0, 5
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DC_1D_16to32_FUNCS top, a
+DC_1D_16to32_FUNCS left, l
+INIT_XMM ssse3
+DC_1D_16to32_FUNCS top, a
+DC_1D_16to32_FUNCS left, l
+
+%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
+%if HAVE_AVX2_EXTERNAL
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ movhlps xm1, xm0
+ paddw xm0, xm1
+ pmulhrsw xm0, [pw_1024]
+ vpbroadcastb m0, xm0
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INIT_YMM avx2
+DC_1D_AVX2_FUNCS top, a
+DC_1D_AVX2_FUNCS left, l
+
+; v
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
+ mova m0, [aq]
+ mova m1, [aq+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_YMM avx
+cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+; h
+
+%macro H_XMM_FUNCS 2
+%if notcpuflag(avx)
+cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
+ movd m0, [lq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_4x3_4x2_4x1_4x0]
+%else
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0123
+ punpcklwd m0, m0
+%endif
+ lea stride3q, [strideq*3]
+ movd [dstq+strideq*0], m0
+ psrldq m0, 4
+ movd [dstq+strideq*1], m0
+ psrldq m0, 4
+ movd [dstq+strideq*2], m0
+ psrldq m0, 4
+ movd [dstq+stride3q ], m0
+ RET
+%endif
+
+cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+ mova m2, [pb_8x1_8x0]
+ mova m3, [pb_8x3_8x2]
+%endif
+ lea stride3q, [strideq*3]
+ mov cntq, 1
+.loop:
+ movd m0, [lq+cntq*4]
+%if cpuflag(ssse3)
+ pshufb m1, m0, m3
+ pshufb m0, m2
+%else
+ punpcklbw m0, m0
+ punpcklwd m0, m0
+ pshufd m1, m0, q2233
+ pshufd m0, m0, q0011
+%endif
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ movq [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+
+cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+%endif
+ lea stride3q, [strideq*3]
+ mov cntq, 3
+.loop:
+ movd m3, [lq+cntq*4]
+%if cpuflag(ssse3)
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+%else
+ punpcklbw m3, m3
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+%endif
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+ pshufb m2, m3, m5
+ pshufb m3, m4
+%else
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+%endif
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+
+cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+%endif
+ lea stride3q, [strideq*3]
+ mov cntq, 7
+.loop:
+ movd m3, [lq+cntq*4]
+%if cpuflag(ssse3)
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+%else
+ punpcklbw m3, m3
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+%endif
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+%if cpuflag(ssse3)
+ pshufb m2, m3, m5
+ pshufb m3, m4
+%else
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+%endif
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+H_XMM_FUNCS 2, 4
+INIT_XMM ssse3
+H_XMM_FUNCS 4, 8
+INIT_XMM avx
+H_XMM_FUNCS 4, 8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+ lea stride3q, [strideq*3]
+ mov cntq, 7
+.loop:
+ movd xm3, [lq+cntq*4]
+ vinserti128 m3, m3, xm3, 1
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufb m2, m3, m5
+ pshufb m3, m4
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+%endif
+
+; tm
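+; TrueMotion prediction: dst[y][x] = clip(left[y] + above[x] - topleft); the
+; code keeps (above - topleft) in words and adds the broadcast per-row left
+; value, with packuswb providing the final clip to [0,255]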
+
+%macro TM_MMX_FUNCS 0
+cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
+ pxor m1, m1
+ movd m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpcklbw m0, m1
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+ mova m3, [pw_m256]
+ mova m1, [pw_m255]
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ pshufw m2, m2, q0000
+%endif
+ psubw m0, m2
+ mov cntq, 1
+.loop:
+ pinsrw m2, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m4, m2, m1
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ pshufw m4, m2, q1111
+ pshufw m2, m2, q0000
+%endif
+ paddw m4, m0
+ paddw m2, m0
+ packuswb m4, m4
+ packuswb m2, m2
+ movd [dstq+strideq*0], m4
+ movd [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+%endmacro
+
+INIT_MMX mmxext
+TM_MMX_FUNCS
+INIT_MMX ssse3
+TM_MMX_FUNCS
+
+%macro TM_XMM_FUNCS 0
+cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
+ pxor m1, m1
+ movh m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpcklbw m0, m1
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+ mova m3, [pw_m256]
+ mova m1, [pw_m255]
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ punpcklwd m2, m2
+ pshufd m2, m2, q0000
+%endif
+ psubw m0, m2
+ mov cntq, 3
+.loop:
+ pinsrw m2, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m4, m2, m1
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ punpcklwd m2, m2
+ pshufd m4, m2, q1111
+ pshufd m2, m2, q0000
+%endif
+ paddw m4, m0
+ paddw m2, m0
+ packuswb m4, m2
+ movh [dstq+strideq*0], m4
+ movhps [dstq+strideq*1], m4
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
+ pxor m3, m3
+ mova m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+ mova m4, [pw_m256]
+ mova m3, [pw_m255]
+ pshufb m2, m4
+%else
+ punpcklbw m2, m3
+ punpcklwd m2, m2
+ pshufd m2, m2, q0000
+%endif
+ psubw m1, m2
+ psubw m0, m2
+ mov cntq, 7
+.loop:
+ pinsrw m7, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m5, m7, m3
+ pshufb m7, m4
+%else
+ punpcklbw m7, m3
+ punpcklwd m7, m7
+ pshufd m5, m7, q1111
+ pshufd m7, m7, q0000
+%endif
+ paddw m2, m5, m0
+ paddw m5, m1
+ paddw m6, m7, m0
+ paddw m7, m1
+ packuswb m2, m5
+ packuswb m6, m7
+ mova [dstq+strideq*0], m2
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+
+%if ARCH_X86_64
+%define mem 0
+%else
+%define mem 64
+%endif
+cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
+ pxor m5, m5
+ pinsrw m4, [aq-1], 0
+ mova m0, [aq]
+ mova m2, [aq+16]
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+ mova m12, [pw_m256]
+ mova m13, [pw_m255]
+%define pw_m256_reg m12
+%define pw_m255_reg m13
+%else
+%define pw_m256_reg [pw_m256]
+%define pw_m255_reg [pw_m255]
+%endif
+ pshufb m4, pw_m256_reg
+%else
+ punpcklbw m4, m5
+ punpcklwd m4, m4
+ pshufd m4, m4, q0000
+%endif
+ punpckhbw m1, m0, m5
+ punpckhbw m3, m2, m5
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ psubw m1, m4
+ psubw m0, m4
+ psubw m3, m4
+ psubw m2, m4
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+%else
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+ mova [rsp+3*16], m3
+%endif
+ mov cntq, 15
+.loop:
+ pinsrw m3, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m7, m3, pw_m255_reg
+ pshufb m3, pw_m256_reg
+%else
+ pxor m7, m7
+ punpcklbw m3, m7
+ punpcklwd m3, m3
+ pshufd m7, m3, q1111
+ pshufd m3, m3, q0000
+%endif
+%if ARCH_X86_64
+ paddw m4, m7, m8
+ paddw m5, m7, m9
+ paddw m6, m7, m10
+ paddw m7, m11
+ paddw m0, m3, m8
+ paddw m1, m3, m9
+ paddw m2, m3, m10
+ paddw m3, m11
+%else
+ paddw m4, m7, [rsp+0*16]
+ paddw m5, m7, [rsp+1*16]
+ paddw m6, m7, [rsp+2*16]
+ paddw m7, [rsp+3*16]
+ paddw m0, m3, [rsp+0*16]
+ paddw m1, m3, [rsp+1*16]
+ paddw m2, m3, [rsp+2*16]
+ paddw m3, [rsp+3*16]
+%endif
+ packuswb m4, m5
+ packuswb m6, m7
+ packuswb m0, m1
+ packuswb m2, m3
+ mova [dstq+strideq*0+ 0], m4
+ mova [dstq+strideq*0+16], m6
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m2
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+%undef pw_m256_reg
+%undef pw_m255_reg
+%undef mem
+%endmacro
+
+INIT_XMM sse2
+TM_XMM_FUNCS
+INIT_XMM ssse3
+TM_XMM_FUNCS
+INIT_XMM avx
+TM_XMM_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
+ pxor m3, m3
+ pinsrw xm2, [aq-1], 0
+ vinserti128 m2, m2, xm2, 1
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, l, cnt
+ mova m4, [pw_m256]
+ mova m5, [pw_m255]
+ pshufb m2, m4
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
+ psubw m1, m2
+ psubw m0, m2
+ mov cntq, 15
+.loop:
+ pinsrw xm7, [lq+cntq*2], 0
+ vinserti128 m7, m7, xm7, 1
+ pshufb m3, m7, m5
+ pshufb m7, m4
+ paddw m2, m3, m0
+ paddw m3, m1
+ paddw m6, m7, m0
+ paddw m7, m1
+ packuswb m2, m3
+ packuswb m6, m7
+ mova [dstq+strideq*0], m2
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+%endif
+
+; dl
+
+%macro LOWPASS 4 ; left [dst], center, right, tmp
+ pxor m%4, m%1, m%3
+ pand m%4, [pb_1]
+ pavgb m%1, m%3
+ psubusb m%1, m%4
+ pavgb m%1, m%2
+%endmacro
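+; LOWPASS computes the 3-tap smoothing filter (left + 2*center + right + 2) >> 2
+; entirely in bytes: pavgb rounds up, so ((left ^ right) & 1) is subtracted to
+; get floor((left + right) / 2) before averaging with center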
+
+%macro DL_MMX_FUNCS 0
+cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
+ movq m1, [aq]
+%if cpuflag(ssse3)
+ pshufb m0, m1, [pb_0to5_2x7]
+ pshufb m2, m1, [pb_2to6_3x7]
+%else
+ punpckhbw m3, m1, m1 ; 44556677
+ pand m0, m1, [pb_6xm1_2x0] ; 012345__
+ pand m3, [pb_6x0_2xm1] ; ______77
+ psrlq m2, m1, 16 ; 234567__
+ por m0, m3 ; 01234577
+ por m2, m3 ; 23456777
+%endif
+ psrlq m1, 8
+ LOWPASS 0, 1, 2, 3
+
+ pshufw m1, m0, q3321
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*2], m1
+ psrlq m0, 8
+ psrlq m1, 8
+ add dstq, strideq
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*2], m1
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DL_MMX_FUNCS
+INIT_MMX ssse3
+DL_MMX_FUNCS
+
+%macro DL_XMM_FUNCS 0
+cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
+ movq m0, [aq]
+ lea stride5q, [strideq*5]
+%if cpuflag(ssse3)
+ pshufb m1, m0, [pb_1to6_10x7]
+%else
+ punpcklbw m1, m0, m0 ; 0011223344556677
+ punpckhwd m1, m1 ; 4x4,4x5,4x6,4x7
+%endif
+ shufps m0, m1, q3310
+%if notcpuflag(ssse3)
+ psrldq m1, m0, 1
+ shufps m1, m0, q3210
+%endif
+ psrldq m2, m1, 1
+ LOWPASS 0, 1, 2, 3
+
+ pshufd m1, m0, q3321
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*4], m1
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*1], m0
+ movq [dstq+stride5q ], m1
+ lea dstq, [dstq+strideq*2]
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*4], m1
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*1], m0
+ movq [dstq+stride5q ], m1
+ RET
+
+cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
+ mova m0, [aq]
+%if cpuflag(ssse3)
+ mova m5, [pb_1toE_2xF]
+ pshufb m1, m0, m5
+ pshufb m2, m1, m5
+ pshufb m4, m0, [pb_15]
+%else
+ pand m5, m0, [pb_15x0_1xm1] ; _______________F
+ psrldq m1, m0, 1 ; 123456789ABCDEF_
+ por m1, m5 ; 123456789ABCDEFF
+ psrldq m2, m1, 1 ; 23456789ABCDEFF_
+ por m2, m5 ; 23456789ABCDEFFF
+ pshufhw m4, m1, q3333 ; xxxxxxxxFFFFFFFF
+%endif
+ LOWPASS 0, 1, 2, 3
+ DEFINE_ARGS dst, stride, cnt, stride9
+ lea stride9q, [strideq+strideq*8]
+ mov cntd, 4
+
+.loop:
+ movhlps m4, m0
+ mova [dstq+strideq*0], m0
+%if cpuflag(ssse3)
+ pshufb m0, m5
+%else
+ psrldq m0, 1
+ por m0, m5
+%endif
+ mova [dstq+strideq*8], m4
+ movhlps m4, m0
+ mova [dstq+strideq*1], m0
+%if cpuflag(ssse3)
+ pshufb m0, m5
+%else
+ psrldq m0, 1
+ por m0, m5
+%endif
+ mova [dstq+stride9q ], m4
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
+ mova m0, [aq]
+ mova m1, [aq+16]
+ PALIGNR m2, m1, m0, 1, m4
+ PALIGNR m3, m1, m0, 2, m4
+ LOWPASS 0, 2, 3, 4
+%if cpuflag(ssse3)
+ mova m5, [pb_1toE_2xF]
+ pshufb m2, m1, m5
+ pshufb m3, m2, m5
+ pshufb m6, m1, [pb_15]
+ mova m7, m6
+%else
+ pand m5, m1, [pb_15x0_1xm1] ; _______________F
+ psrldq m2, m1, 1 ; 123456789ABCDEF_
+ por m2, m5 ; 123456789ABCDEFF
+ psrldq m3, m2, 1 ; 23456789ABCDEFF_
+ por m3, m5 ; 23456789ABCDEFFF
+ pshufhw m7, m2, q3333 ; xxxxxxxxFFFFFFFF
+ pshufd m6, m7, q3333
+%endif
+ LOWPASS 1, 2, 3, 4
+ lea dst16q, [dstq +strideq*8]
+ mov cntd, 8
+ lea dst16q, [dst16q+strideq*8]
+.loop:
+ movhlps m7, m1
+ mova [dstq +strideq*0+ 0], m0
+ mova [dstq +strideq*0+16], m1
+ movhps [dstq+strideq*8+ 0], m0
+ movq [dstq +strideq*8+ 8], m1
+ mova [dstq +strideq*8+16], m7
+ mova [dst16q+strideq*0+ 0], m1
+ mova [dst16q+strideq*0+16], m6
+ mova [dst16q+strideq*8+ 0], m7
+ mova [dst16q+strideq*8+16], m6
+%if cpuflag(avx)
+ vpalignr m0, m1, m0, 1
+ pshufb m1, m5
+%elif cpuflag(ssse3)
+ palignr m2, m1, m0, 1
+ pshufb m1, m5
+ mova m0, m2
+%else
+ mova m4, m1
+ psrldq m0, 1
+ pslldq m4, 15
+ psrldq m1, 1
+ por m0, m4
+ por m1, m5
+%endif
+ add dstq, strideq
+ add dst16q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DL_XMM_FUNCS
+INIT_XMM ssse3
+DL_XMM_FUNCS
+INIT_XMM avx
+DL_XMM_FUNCS
+
+; dr
+
+%macro DR_MMX_FUNCS 0
+cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq-1]
+ movd m1, [aq+3]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ PALIGNR m1, m0, 1, m3
+ psrlq m2, m1, 8
+ LOWPASS 0, 1, 2, 3
+
+ movd [dstq+stride3q ], m0
+ psrlq m0, 8
+ movd [dstq+strideq*2], m0
+ psrlq m0, 8
+ movd [dstq+strideq*1], m0
+ psrlq m0, 8
+ movd [dstq+strideq*0], m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DR_MMX_FUNCS
+INIT_MMX ssse3
+DR_MMX_FUNCS
+
+%macro DR_XMM_FUNCS 0
+cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
+ movq m1, [lq]
+ movhps m1, [aq-1]
+ movd m2, [aq+7]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pslldq m0, m1, 1
+ PALIGNR m2, m1, 1, m3
+ LOWPASS 0, 1, 2, 3
+
+ movhps [dstq+strideq*0], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*1], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*2], m0
+ pslldq m0, 1
+ movhps [dstq+stride3q ], m0
+ pslldq m0, 1
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*1], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*2], m0
+ pslldq m0, 1
+ movhps [dstq+stride3q ], m0
+ RET
+
+cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
+ mova m1, [lq]
+ movu m2, [aq-1]
+ movd m4, [aq+15]
+ DEFINE_ARGS dst, stride, stride9, cnt
+ lea stride9q, [strideq *3]
+ mov cntd, 4
+ lea stride9q, [stride9q*3]
+ PALIGNR m4, m2, 1, m5
+ PALIGNR m3, m2, m1, 15, m5
+ LOWPASS 3, 2, 4, 5
+ pslldq m0, m1, 1
+ PALIGNR m2, m1, 1, m4
+ LOWPASS 0, 1, 2, 4
+
+.loop:
+ mova [dstq+strideq*0 ], m3
+ movhps [dstq+strideq*8+0], m0
+ movq [dstq+strideq*8+8], m3
+ PALIGNR m3, m0, 15, m1
+ pslldq m0, 1
+ mova [dstq+strideq*1 ], m3
+ movhps [dstq+stride9q +0], m0
+ movq [dstq+stride9q +8], m3
+ PALIGNR m3, m0, 15, m1
+ pslldq m0, 1
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
+ mova m1, [lq]
+ mova m2, [lq+16]
+ movu m3, [aq-1]
+ movu m4, [aq+15]
+ movd m5, [aq+31]
+ DEFINE_ARGS dst, stride, stride8, cnt
+ lea stride8q, [strideq*8]
+ PALIGNR m5, m4, 1, m7
+ PALIGNR m6, m4, m3, 15, m7
+ LOWPASS 5, 4, 6, 7
+ PALIGNR m4, m3, 1, m7
+ PALIGNR m6, m3, m2, 15, m7
+ LOWPASS 4, 3, 6, 7
+ PALIGNR m3, m2, 1, m7
+ PALIGNR m6, m2, m1, 15, m7
+ LOWPASS 3, 2, 6, 7
+ PALIGNR m2, m1, 1, m6
+ pslldq m0, m1, 1
+ LOWPASS 2, 1, 0, 6
+ mov cntd, 16
+
+ ; out=m2/m3/m4/m5
+.loop:
+ mova [dstq+stride8q*0+ 0], m4
+ mova [dstq+stride8q*0+16], m5
+ mova [dstq+stride8q*2+ 0], m3
+ mova [dstq+stride8q*2+16], m4
+ PALIGNR m5, m4, 15, m6
+ PALIGNR m4, m3, 15, m6
+ PALIGNR m3, m2, 15, m6
+ pslldq m2, 1
+ add dstq, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DR_XMM_FUNCS
+INIT_XMM ssse3
+DR_XMM_FUNCS
+INIT_XMM avx
+DR_XMM_FUNCS
+
+; vl
+
+INIT_MMX mmxext
+cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
+ movq m0, [aq]
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ psrlq m1, 8
+ psrlq m2, 8
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ RET
+
+%macro VL_XMM_FUNCS 0
+cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
+ movq m0, [aq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to6_9x7]
+%else
+ punpcklbw m1, m0, m0
+ punpckhwd m1, m1
+ shufps m0, m1, q3310
+%endif
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psrldq m1, m0, 1
+ psrldq m2, m0, 2
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ movq [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m2
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m2
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+%if cpuflag(ssse3)
+ mova m4, [pb_1toE_2xF]
+ pshufb m1, m0, m4
+ pshufb m2, m1, m4
+%else
+ pand m4, m0, [pb_15x0_1xm1] ; _______________F
+ psrldq m1, m0, 1 ; 123456789ABCDEF_
+ por m1, m4 ; 123456789ABCDEFF
+ psrldq m2, m1, 1 ; 23456789ABCDEFF_
+ por m2, m4 ; 23456789ABCDEFFF
+%endif
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+%if cpuflag(ssse3)
+ pshufb m1, m4
+ pshufb m2, m4
+%else
+ psrldq m1, 1
+ psrldq m2, 1
+ por m1, m4
+ por m2, m4
+%endif
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+%if cpuflag(ssse3)
+ pshufb m1, m4
+ pshufb m2, m4
+%else
+ psrldq m1, 1
+ psrldq m2, 1
+ por m1, m4
+ por m2, m4
+%endif
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
+ mova m0, [aq]
+ mova m5, [aq+16]
+ DEFINE_ARGS dst, stride, dst16, cnt
+ PALIGNR m2, m5, m0, 1, m4
+ PALIGNR m3, m5, m0, 2, m4
+ lea dst16q, [dstq +strideq*8]
+ LOWPASS 3, 2, 0, 6
+ pavgb m2, m0
+%if cpuflag(ssse3)
+ mova m4, [pb_1toE_2xF]
+ pshufb m0, m5, m4
+ pshufb m1, m0, m4
+%else
+ pand m4, m5, [pb_15x0_1xm1] ; _______________F
+ psrldq m0, m5, 1 ; 123456789ABCDEF_
+ por m0, m4 ; 123456789ABCDEFF
+ psrldq m1, m0, 1 ; 23456789ABCDEFF_
+ por m1, m4 ; 23456789ABCDEFFF
+%endif
+ lea dst16q, [dst16q+strideq*8]
+ LOWPASS 1, 0, 5, 6
+ pavgb m0, m5
+%if cpuflag(ssse3)
+ pshufb m5, [pb_15]
+%else
+ punpckhbw m5, m4, m4
+ pshufhw m5, m5, q3333
+ punpckhqdq m5, m5
+%endif
+ mov cntd, 8
+
+.loop:
+%macro %%write 3
+ mova [dstq+stride%1+ 0], %2
+ mova [dstq+stride%1+16], %3
+ movhps [dst16q+stride%1 ], %2
+ movu [dst16q+stride%1+ 8], %3
+ movq [dst16q+stride%1+24], m5
+%if cpuflag(avx)
+ palignr %2, %3, %2, 1
+ pshufb %3, m4
+%elif cpuflag(ssse3)
+ palignr m6, %3, %2, 1
+ pshufb %3, m4
+ mova %2, m6
+%else
+ pslldq m6, %3, 15
+ psrldq %3, 1
+ psrldq %2, 1
+ por %3, m4
+ por %2, m6
+%endif
+%endmacro
+
+ %%write q*0, m2, m0
+ %%write q*1, m3, m1
+ lea dstq, [dstq +strideq*2]
+ lea dst16q, [dst16q+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+VL_XMM_FUNCS
+INIT_XMM ssse3
+VL_XMM_FUNCS
+INIT_XMM avx
+VL_XMM_FUNCS
+
+; vr
+
+%macro VR_MMX_FUNCS 0
+cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
+ movq m1, [aq-1]
+ punpckldq m2, [lq]
+ movd m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pavgb m0, m1
+ PALIGNR m1, m2, 5, m3
+ psrlq m2, m1, 8
+ psllq m3, m1, 8
+ LOWPASS 2, 1, 3, 4
+
+ ; ABCD <- for the following predictor:
+ ; EFGH
+ ; IABC | m0 contains ABCDxxxx
+ ; JEFG | m2 contains xJIEFGHx
+
+%if cpuflag(ssse3)
+ punpckldq m0, m2
+ pshufb m2, [pb_13456_3xm1]
+ movd [dstq+strideq*0], m0
+ pshufb m0, [pb_6012_4xm1]
+ movd [dstq+stride3q ], m2
+ psrlq m2, 8
+ movd [dstq+strideq*2], m0
+ movd [dstq+strideq*1], m2
+%else
+ psllq m1, m2, 40
+ psrlq m2, 24
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m2
+ PALIGNR m0, m1, 7, m3
+ psllq m1, 8
+ PALIGNR m2, m1, 7, m3
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m2
+%endif
+ RET
+%endmacro
+
+INIT_MMX mmxext
+VR_MMX_FUNCS
+INIT_MMX ssse3
+VR_MMX_FUNCS
+
+%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
+cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
+ movu m1, [aq-1]
+ movhps m2, [lq]
+ movq m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pavgb m0, m1
+ PALIGNR m1, m2, 9, m3
+ pslldq m2, m1, 1
+ pslldq m3, m1, 2
+ LOWPASS 1, 2, 3, 4
+
+ ; ABCDEFGH <- for the following predictor:
+ ; IJKLMNOP
+ ; QABCDEFG | m0 contains ABCDEFGHxxxxxxxx
+ ; RIJKLMNO | m1 contains xxVUTSRQIJKLMNOP
+ ; SQABCDEF
+ ; TRIJKLMN
+ ; USQABCDE
+ ; VTRIJKLM
+
+%if cpuflag(ssse3)
+ punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ
+%endif
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+ pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
+ pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
+%else
+ psrlw m2, m1, 8 ; x_U_S_Q_xxxxxxxx
+ pand m3, m1, [pw_255] ; x_V_T_R_xxxxxxxx
+ packuswb m3, m2 ; xVTRxxxxxUSQxxxx
+ pslldq m3, 4 ; xxxxxVTRxxxxxUSQ
+ PALIGNR m0, m3, 7, m4 ; xxxxxxUSQABCDEFG
+ psrldq m1, 8
+ pslldq m3, 8
+ PALIGNR m1, m3, 7, m4 ; xxxxxxVTRIJKLMNO
+%endif
+ movhps [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ pslldq m0, 1
+ pslldq m1, 1
+ movhps [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m1
+ pslldq m0, 1
+ pslldq m1, 1
+ movhps [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m1
+ RET
+
+cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
+ mova m0, [aq]
+ movu m1, [aq-1]
+ mova m2, [lq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ PALIGNR m3, m1, m2, 15, m6
+ LOWPASS 3, 1, 0, 4
+ pavgb m0, m1
+ PALIGNR m1, m2, 1, m6
+ pslldq m4, m2, 1
+ LOWPASS 1, 2, 4, 5
+%if cpuflag(ssse3)
+ pshufb m1, [pb_02468ACE_13579BDF]
+%else
+ psrlw m5, m1, 8
+ pand m1, [pw_255]
+ packuswb m1, m5
+%endif
+ mov cntd, 4
+
+.loop:
+ movlhps m2, m1
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m3
+ PALIGNR m4, m0, m1, 15, m6
+ PALIGNR m5, m3, m2, 15, m6
+ mova [dstq+strideq*2], m4
+ mova [dstq+stride3q ], m5
+ lea dstq, [dstq+strideq*4]
+ PALIGNR m0, m1, 14, m6
+ PALIGNR m3, m2, 14, m6
+ pslldq m1, 2
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
+ mova m0, [aq]
+ mova m2, [aq+16]
+ movu m1, [aq-1]
+ PALIGNR m3, m2, m0, 15, m6
+ PALIGNR m4, m2, m0, 14, m6
+ LOWPASS 4, 3, 2, 5
+ pavgb m3, m2
+ mova m2, [lq+16]
+ PALIGNR m5, m1, m2, 15, m6
+ LOWPASS 5, 1, 0, 6
+ pavgb m0, m1
+ mova m6, [lq]
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova [dstq], m0
+%endif
+ PALIGNR m1, m2, 1, m0
+ PALIGNR m7, m2, m6, 15, m0
+ LOWPASS 1, 2, 7, 0
+ PALIGNR m2, m6, 1, m0
+ pslldq m7, m6, 1
+ LOWPASS 2, 6, 7, 0
+%if cpuflag(ssse3)
+ pshufb m1, [pb_02468ACE_13579BDF]
+ pshufb m2, [pb_02468ACE_13579BDF]
+%else
+ psrlw m0, m1, 8
+ psrlw m6, m2, 8
+ pand m1, [pw_255]
+ pand m2, [pw_255]
+ packuswb m1, m0
+ packuswb m2, m6
+%endif
+ DEFINE_ARGS dst, stride, dst16, cnt
+ lea dst16q, [dstq +strideq*8]
+ lea dst16q, [dst16q+strideq*8]
+ SBUTTERFLY qdq, 2, 1, 6
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova m0, [dstq]
+%endif
+ mov cntd, 8
+
+.loop:
+ ; even lines (0, 2, 4, ...): m1 | m0, m3
+ ; odd lines (1, 3, 5, ...): m2 | m5, m4
+%macro %%write 4
+ mova [dstq+stride%1+ 0], %3
+ mova [dstq+stride%1+16], %4
+ movhps [dst16q+stride%1 ], %2
+ movu [dst16q+stride%1+ 8], %3
+ movq [dst16q+stride%1+24], %4
+ PALIGNR %4, %3, 15, m6
+ PALIGNR %3, %2, 15, m6
+ pslldq %2, 1
+%endmacro
+
+ %%write q*0, m1, m0, m3
+ %%write q*1, m2, m5, m4
+ lea dstq, [dstq +strideq*2]
+ lea dst16q, [dst16q+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+VR_XMM_FUNCS 7
+INIT_XMM ssse3
+VR_XMM_FUNCS 6
+INIT_XMM avx
+VR_XMM_FUNCS 6
+
+; hd
+
+INIT_MMX mmxext
+cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq-1]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ ; DHIJ <- for the following predictor:
+ ; CGDH
+ ; BFCG | m1 contains ABCDxxxx
+ ; AEBF | m2 contains EFGHIJxx
+
+ punpcklbw m1, m2
+ punpckhdq m0, m1, m2
+
+ ; m1 contains AEBFCGDH
+ ; m0 contains CGDHIJxx
+
+ movd [dstq+stride3q ], m1
+ movd [dstq+strideq*1], m0
+ psrlq m1, 16
+ psrlq m0, 16
+ movd [dstq+strideq*2], m1
+ movd [dstq+strideq*0], m0
+ RET
+
+%macro HD_XMM_FUNCS 0
+cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a
+ movq m0, [lq]
+ movhps m0, [aq-1]
+ DEFINE_ARGS dst, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+ psrldq m1, m0, 1
+ psrldq m2, m1, 1
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ ; HPQRSTUV <- for the following predictor
+ ; GOHPQRST
+ ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx
+ ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx
+ ; DLEMFNGO
+ ; CKDLEMFN
+ ; BJCKDLEM
+ ; AIBJCKDL
+
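+ ; interleave the 2-tap averages (A..H) with the 3-tap lowpass values
+ ; (I..P): each hd row consists of such (avg,filter) pairs walking down
+ ; the left edge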
+ punpcklbw m1, m2
+ movhlps m2, m2
+
+ ; m1 contains AIBJCKDLEMFNGOHP
+ ; m2 contains QRSTUVxxxxxxxxxx
+
+ movhps [dstq +stride3q ], m1
+ movq [dst4q+stride3q ], m1
+ PALIGNR m3, m2, m1, 2, m4
+ movhps [dstq +strideq*2], m3
+ movq [dst4q+strideq*2], m3
+ PALIGNR m3, m2, m1, 4, m4
+ movhps [dstq +strideq*1], m3
+ movq [dst4q+strideq*1], m3
+ PALIGNR m2, m1, 6, m4
+ movhps [dstq +strideq*0], m2
+ movq [dst4q+strideq*0], m2
+ RET
+
+cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
+ mova m0, [lq]
+ movu m3, [aq-1]
+ DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
+ lea stride4q, [strideq*4]
+ lea dst4q, [dstq +stride4q]
+ lea dst8q, [dst4q+stride4q]
+ lea dst12q, [dst8q+stride4q]
+ psrldq m4, m3, 1
+ psrldq m5, m3, 2
+ LOWPASS 5, 4, 3, 6
+ PALIGNR m1, m3, m0, 1, m6
+ PALIGNR m2, m3, m0, 2, m6
+ LOWPASS 2, 1, 0, 6
+ pavgb m1, m0
+ SBUTTERFLY bw, 1, 2, 6
+
+ ; I PROBABLY INVERTED L0 and L16 here
+ ; m1, m2, m5
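+ ; (m1/m2 hold the interleaved left samples, m5 the filtered above row;
+ ; every iteration shifts them along by one (avg,filter) pair)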
+.loop:
+ sub stride4q, strideq
+ movhps [dstq +stride4q +0], m2
+ movq [dstq +stride4q +8], m5
+ mova [dst4q+stride4q ], m2
+ movhps [dst8q+stride4q +0], m1
+ movq [dst8q+stride4q +8], m2
+ mova [dst12q+stride4q ], m1
+%if cpuflag(avx)
+ palignr m1, m2, m1, 2
+ palignr m2, m5, m2, 2
+%elif cpuflag(ssse3)
+ palignr m3, m2, m1, 2
+ palignr m0, m5, m2, 2
+ mova m1, m3
+ mova m2, m0
+%else
+ ; slightly modified version of PALIGNR
+ mova m6, m2
+ mova m4, m5
+ pslldq m6, 14
+ pslldq m4, 14
+ psrldq m1, 2
+ psrldq m2, 2
+ por m1, m6
+ por m2, m4
+%endif
+ psrldq m5, 2
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [lq+16]
+ movu m2, [aq-1]
+ movu m3, [aq+15]
+ DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
+ lea stride8q, [strideq*8]
+ lea dst8q, [dstq +stride8q]
+ lea dst16q, [dst8q +stride8q]
+ lea dst24q, [dst16q+stride8q]
+ psrldq m4, m3, 1
+ psrldq m5, m3, 2
+ LOWPASS 5, 4, 3, 6
+ PALIGNR m4, m3, m2, 2, m6
+ PALIGNR m3, m2, 1, m6
+ LOWPASS 4, 3, 2, 6
+ PALIGNR m3, m2, m1, 2, m6
+ PALIGNR m2, m1, 1, m6
+ LOWPASS 3, 2, 1, 6
+ pavgb m2, m1
+ PALIGNR m6, m1, m0, 1, m7
+ PALIGNR m1, m0, 2, m7
+ LOWPASS 1, 6, 0, 7
+ pavgb m0, m6
+ SBUTTERFLY bw, 2, 3, 6
+ SBUTTERFLY bw, 0, 1, 6
+
+ ; m0, m1, m2, m3, m4, m5
+.loop:
+ sub stride8q, strideq
+ mova [dstq +stride8q+ 0], m3
+ mova [dstq +stride8q+16], m4
+ mova [dst8q +stride8q+ 0], m2
+ mova [dst8q +stride8q+16], m3
+ mova [dst16q+stride8q+ 0], m1
+ mova [dst16q+stride8q+16], m2
+ mova [dst24q+stride8q+ 0], m0
+ mova [dst24q+stride8q+16], m1
+%if cpuflag(avx)
+ palignr m0, m1, m0, 2
+ palignr m1, m2, m1, 2
+ palignr m2, m3, m2, 2
+ palignr m3, m4, m3, 2
+ palignr m4, m5, m4, 2
+ psrldq m5, 2
+%elif cpuflag(ssse3)
+ psrldq m6, m5, 2
+ palignr m5, m4, 2
+ palignr m4, m3, 2
+ palignr m3, m2, 2
+ palignr m2, m1, 2
+ palignr m1, m0, 2
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ mova m4, m5
+ mova m5, m6
+%else
+ ; sort of a half-integrated version of PALIGNR
+ pslldq m7, m4, 14
+ pslldq m6, m5, 14
+ psrldq m4, 2
+ psrldq m5, 2
+ por m4, m6
+ pslldq m6, m3, 14
+ psrldq m3, 2
+ por m3, m7
+ pslldq m7, m2, 14
+ psrldq m2, 2
+ por m2, m6
+ pslldq m6, m1, 14
+ psrldq m1, 2
+ por m1, m7
+ psrldq m0, 2
+ por m0, m6
+%endif
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HD_XMM_FUNCS
+INIT_XMM ssse3
+HD_XMM_FUNCS
+INIT_XMM avx
+HD_XMM_FUNCS
+
+%macro HU_MMX_FUNCS 0
+cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
+ movd m0, [lq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to2_5x3]
+%else
+ punpcklbw m1, m0, m0 ; 00112233
+ pshufw m1, m1, q3333 ; 33333333
+ punpckldq m0, m1 ; 01233333
+%endif
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ SBUTTERFLY bw, 1, 2, 0
+ PALIGNR m2, m1, 2, m0
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ punpckhdq m1, m1
+ punpckhdq m2, m2
+ movd [dstq+strideq*2], m1
+ movd [dstq+stride3q ], m2
+ RET
+%endmacro
+
+INIT_MMX mmxext
+HU_MMX_FUNCS
+INIT_MMX ssse3
+HU_MMX_FUNCS
+
+%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
+cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
+ movq m0, [lq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to6_9x7]
+%else
+ punpcklbw m1, m0, m0 ; 0011223344556677
+ punpckhwd m1, m1 ; 4444555566667777
+ shufps m0, m1, q3310 ; 0123456777777777
+%endif
+ psrldq m1, m0, 1
+ psrldq m2, m1, 1
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+ SBUTTERFLY bw, 1, 2, 0
+ movq [dstq +strideq*0], m1
+ movhps [dst4q+strideq*0], m1
+ PALIGNR m0, m2, m1, 2, m3
+ movq [dstq +strideq*1], m0
+ movhps [dst4q+strideq*1], m0
+ PALIGNR m0, m2, m1, 4, m3
+ movq [dstq +strideq*2], m0
+ movhps [dst4q+strideq*2], m0
+ PALIGNR m2, m1, 6, m3
+ movq [dstq +stride3q ], m2
+ movhps [dst4q+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
+ mova m0, [lq]
+%if cpuflag(ssse3)
+ mova m3, [pb_2toE_3xF]
+ pshufb m1, m0, [pb_1toE_2xF]
+ pshufb m2, m0, m3
+%else
+ pand m3, m0, [pb_15x0_1xm1]
+ psrldq m1, m0, 1
+ por m1, m3
+ punpckhbw m3, m3
+ psrldq m2, m0, 2
+ por m2, m3
+%endif
+ LOWPASS 2, 1, 0, 4
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride9, cnt
+ lea stride9q, [strideq*8+strideq]
+ mov cntd, 4
+ SBUTTERFLY bw, 1, 2, 0
+
+.loop:
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*8], m2
+ PALIGNR m0, m2, m1, 2, m4
+%if cpuflag(ssse3)
+ pshufb m2, m3
+%else
+ psrldq m2, 2
+ por m2, m3
+%endif
+ mova [dstq+strideq*1], m0
+ mova [dstq+stride9q ], m2
+ PALIGNR m1, m2, m0, 2, m4
+%if cpuflag(ssse3)
+ pshufb m2, m3
+%else
+ psrldq m2, 2
+ por m2, m3
+%endif
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l
+ mova m1, [lq]
+ mova m0, [lq+16]
+ PALIGNR m2, m0, m1, 1, m5
+ PALIGNR m3, m0, m1, 2, m5
+ LOWPASS 3, 2, 1, 5
+ pavgb m2, m1
+%if cpuflag(ssse3)
+ mova m4, [pb_2toE_3xF]
+ pshufb m5, m0, [pb_1toE_2xF]
+ pshufb m1, m0, m4
+%else
+ pand m4, m0, [pb_15x0_1xm1]
+ psrldq m5, m0, 1
+ por m5, m4
+ punpckhbw m4, m4
+ psrldq m1, m0, 2
+ por m1, m4
+%endif
+ LOWPASS 1, 5, 0, 6
+ pavgb m0, m5
+ DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
+ mov cntd, 8
+ xor stride0q, stride0q
+ lea dst8q, [dstq +strideq*8]
+ lea dst16q, [dst8q +strideq*8]
+ lea dst24q, [dst16q+strideq*8]
+ SBUTTERFLY bw, 0, 1, 5
+ SBUTTERFLY bw, 2, 3, 5
+%if cpuflag(ssse3)
+ pshufb m6, m1, [pb_15]
+%else
+ pshufhw m6, m4, q3333
+ punpckhqdq m6, m6
+%endif
+
+.loop:
+ mova [dstq +stride0q+ 0], m2
+ mova [dstq +stride0q+16], m3
+ mova [dst8q +stride0q+ 0], m3
+ mova [dst8q +stride0q+16], m0
+ mova [dst16q+stride0q+ 0], m0
+ mova [dst16q+stride0q+16], m1
+ mova [dst24q+stride0q+ 0], m1
+ mova [dst24q+stride0q+16], m6
+%if cpuflag(avx)
+ palignr m2, m3, m2, 2
+ palignr m3, m0, m3, 2
+ palignr m0, m1, m0, 2
+ pshufb m1, m4
+%elif cpuflag(ssse3)
+ pshufb m5, m1, m4
+ palignr m1, m0, 2
+ palignr m0, m3, 2
+ palignr m3, m2, 2
+ mova m2, m3
+ mova m3, m0
+ mova m0, m1
+ mova m1, m5
+%else
+ ; half-integrated version of PALIGNR
+ pslldq m5, m1, 14
+ pslldq m7, m0, 14
+ psrldq m1, 2
+ psrldq m0, 2
+ por m1, m4
+ por m0, m5
+ pslldq m5, m3, 14
+ psrldq m3, 2
+ por m3, m7
+ psrldq m2, 2
+ por m2, m5
+%endif
+ add stride0q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HU_XMM_FUNCS 8
+INIT_XMM ssse3
+HU_XMM_FUNCS 7
+INIT_XMM avx
+HU_XMM_FUNCS 7
+
+; FIXME 127, 128, 129 ?
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
new file mode 100644
index 0000000000..212e4130e8
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -0,0 +1,2174 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_2: times 8 dd 2
+pd_4: times 8 dd 4
+pd_8: times 8 dd 8
+
+pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
+pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
+pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7
+
+cextern pw_1
+cextern pw_1023
+cextern pw_4095
+cextern pd_16
+cextern pd_32
+cextern pd_65535
+
+; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
+; only 3 registers on x86-32, which would make it one cycle faster, but that
+; would make the code quite a bit uglier...
+
+SECTION .text
+
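+; SCRATCH/UNSCRATCH spill a register to the given stack slot on x86-32
+; (which only has xmm0-7) and simply SWAP it with a spare high register on
+; x86-64; PRELOAD likewise keeps a constant in a register on x86-64 and
+; falls back to a memory operand on x86-32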
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+ mova [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+ mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ mova m1, [aq+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0]
+ mova m1, [aq+mmsize*1]
+ mova m2, [aq+mmsize*2]
+ mova m3, [aq+mmsize*3]
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 16
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*1+32], m2
+ mova [dstq+strideq*1+48], m3
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
+ mova m3, [lq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufw m0, m3, q3333
+ pshufw m1, m3, q2222
+ pshufw m2, m3, q1111
+ pshufw m3, m3, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
+ mova m2, [lq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ punpckhwd m3, m2, m2
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufd m0, m3, q1111
+ pshufd m1, m3, q0000
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ punpcklwd m2, m2
+ pshufd m0, m2, q3333
+ pshufd m1, m2, q2222
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q0000
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
+ mov cntd, 3
+ lea stride3q, [strideq*3]
+.loop:
+ movh m3, [lq+cntq*8]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jge .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
+ mov cntd, 7
+ lea stride3q, [strideq*3]
+.loop:
+ movh m3, [lq+cntq*8]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*1+32], m1
+ mova [dstq+strideq*1+48], m1
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+strideq*2+32], m2
+ mova [dstq+strideq*2+48], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ mova [dstq+stride3q +32], m3
+ mova [dstq+stride3q +48], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jge .loop
+ RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufw m1, m0, q3232
+ paddd m0, [pd_4]
+ paddd m0, m1
+ psrad m0, 3
+ pshufw m0, m0, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_8]
+ paddd m0, m1
+ psrad m0, 4
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [lq+mmsize]
+ paddw m0, [aq]
+ paddw m0, [aq+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_16]
+ paddd m0, m1
+ psrad m0, 5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq+mmsize*0]
+ paddw m0, [lq+mmsize*1]
+ paddw m0, [lq+mmsize*2]
+ paddw m0, [lq+mmsize*3]
+ paddw m0, [aq+mmsize*0]
+ paddw m0, [aq+mmsize*1]
+ paddw m0, [aq+mmsize*2]
+ paddw m0, [aq+mmsize*3]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 16
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_32]
+ paddd m0, m1
+ psrad m0, 6
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*1+32], m0
+ mova [dstq+strideq*1+48], m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+%macro DC_1D_FNS 2
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufw m1, m0, q3232
+ paddd m0, [pd_2]
+ paddd m0, m1
+ psrad m0, 2
+ pshufw m0, m0, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_4]
+ paddd m0, m1
+ psrad m0, 3
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ paddw m0, [%2+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_8]
+ paddd m0, m1
+ psrad m0, 4
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2+mmsize*0]
+ paddw m0, [%2+mmsize*1]
+ paddw m0, [%2+mmsize*2]
+ paddw m0, [%2+mmsize*3]
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 16
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_16]
+ paddd m0, m1
+ psrad m0, 5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*1+32], m0
+ mova [dstq+strideq*1+48], m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+DC_1D_FNS top, aq
+DC_1D_FNS left, lq
+
+INIT_MMX mmxext
+cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
+ mova m5, [pw_1023]
+.body:
+ mova m4, [aq]
+ mova m3, [lq]
+ movd m0, [aq-4]
+ pshufw m0, m0, q1111
+ psubw m4, m0
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufw m0, m3, q3333
+ pshufw m1, m3, q2222
+ pshufw m2, m3, q1111
+ pshufw m3, m3, q0000
+ paddw m0, m4
+ paddw m1, m4
+ paddw m2, m4
+ paddw m3, m4
+ pxor m4, m4
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+ pmaxsw m3, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ pminsw m2, m5
+ pminsw m3, m5
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ RET
+
+cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
+ mova m4, [pw_1023]
+.body:
+ pxor m6, m6
+ mova m5, [aq]
+ movd m0, [aq-4]
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ psubw m5, m0
+ DEFINE_ARGS dst, stride, l, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 1
+.loop:
+ movh m3, [lq+cntq*8]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ paddw m0, m5
+ paddw m1, m5
+ paddw m2, m5
+ paddw m3, m5
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ pmaxsw m2, m6
+ pmaxsw m3, m6
+ pminsw m0, m4
+ pminsw m1, m4
+ pminsw m2, m4
+ pminsw m3, m4
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
+ mova m4, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
+ mova m7, [pw_1023]
+.body:
+ pxor m6, m6
+ mova m4, [aq]
+ mova m5, [aq+mmsize]
+ movd m0, [aq-4]
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ psubw m4, m0
+ psubw m5, m0
+ DEFINE_ARGS dst, stride, l, cnt
+ mov cntd, 7
+.loop:
+ movd m3, [lq+cntq*4]
+ punpcklwd m3, m3
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ paddw m0, m2, m4
+ paddw m2, m5
+ paddw m1, m3, m4
+ paddw m3, m5
+ pmaxsw m0, m6
+ pmaxsw m2, m6
+ pmaxsw m1, m6
+ pmaxsw m3, m6
+ pminsw m0, m7
+ pminsw m2, m7
+ pminsw m1, m7
+ pminsw m3, m7
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m2
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m3
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
+ mova m7, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
+ mova m0, [pw_1023]
+.body:
+ pxor m1, m1
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+%define reg_min m9
+%define reg_max m8
+%else
+ mova [rsp+ 0], m0
+ mova [rsp+16], m1
+%define reg_min [rsp+16]
+%define reg_max [rsp+ 0]
+%endif
+
+ mova m4, [aq+mmsize*0]
+ mova m5, [aq+mmsize*1]
+ mova m6, [aq+mmsize*2]
+ mova m7, [aq+mmsize*3]
+ movd m0, [aq-4]
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ psubw m4, m0
+ psubw m5, m0
+ psubw m6, m0
+ psubw m7, m0
+ DEFINE_ARGS dst, stride, l, cnt
+ mov cntd, 31
+.loop:
+ pinsrw m3, [lq+cntq*2], 0
+ punpcklwd m3, m3
+ pshufd m3, m3, q0000
+ paddw m0, m3, m4
+ paddw m1, m3, m5
+ paddw m2, m3, m6
+ paddw m3, m7
+ pmaxsw m0, reg_min
+ pmaxsw m1, reg_min
+ pmaxsw m2, reg_min
+ pmaxsw m3, reg_min
+ pminsw m0, reg_max
+ pminsw m1, reg_max
+ pminsw m2, reg_max
+ pminsw m3, reg_max
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+ add dstq, strideq
+ dec cntd
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
+ mova m0, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
+
+; Directional intra prediction functions
+;
+; in the functions below, 'abcdefgh' refers to above data (sometimes simply
+; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
+; abbreviated as l[N-M]). * is top-left data. ABCDEFGH or A[N-M] is filtered
+; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
+; top-left data.
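+;
+; Each of the directional predictors below first runs the relevant edge
+; samples through the 3-tap lowpass (and, where the predictor needs it, a
+; 2-tap average), then emits every output row as a progressively shifted
+; view of those filtered vectors.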
+
+; left=(left+2*center+right+2)>>2
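+; (implemented as ((left+right)>>1 + center + 1) >> 1, which gives the same
+; result as the formula above and keeps all intermediates within 16 bits
+; for 10/12 bpp input)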
+%macro LOWPASS 3 ; left [dst], center, right
+ paddw m%1, m%3
+ psraw m%1, 1
+ pavgw m%1, m%2
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst)
+; dst/src can be the same register
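+; (the last sample is repeated because samples beyond the top-right edge
+; are clamped to the last available above pixel)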
+%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+ pshufb %1, %2, %3 ; abcdefgh -> bcdefghh
+%else
+ psrldq %1, %2, 2 ; abcdefgh -> bcdefgh.
+ pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
+%endif
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
+%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+ pshufb %1, %3, %4 ; abcdefgh -> bcdefghh
+ pshufb %2, %1, %4 ; bcdefghh -> cdefghhh
+%else
+ psrldq %1, %3, 2 ; abcdefgh -> bcdefgh.
+ psrldq %2, %3, 4 ; abcdefgh -> cdefgh..
+ pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
+ pshufhw %2, %2, q1110 ; cdefgh.. -> cdefghhh
+%endif
+%endmacro
+
+%macro DL_FUNCS 0
+cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
+ movifnidn aq, amp
+ movu m1, [aq] ; abcdefgh
+ pshufhw m0, m1, q3310 ; abcdefhh
+ SHIFT_RIGHT m1, m1 ; bcdefghh
+ psrldq m2, m1, 2 ; cdefghh.
+ LOWPASS 0, 1, 2 ; BCDEFGh.
+ pshufd m1, m0, q3321 ; DEFGh...
+ movh [dstq+strideq*0], m0
+ movh [dstq+strideq*2], m1
+ add dstq, strideq
+ psrldq m0, 2 ; CDEFGh..
+ psrldq m1, 2 ; EFGh....
+ movh [dstq+strideq*0], m0
+ movh [dstq+strideq*2], m1
+ RET
+
+cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefgh
+%if cpuflag(ssse3)
+ mova m4, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh
+ LOWPASS 0, 1, 2 ; BCDEFGHh
+ shufps m1, m0, m2, q3332 ; FGHhhhhh
+ shufps m3, m0, m1, q2121 ; DEFGHhhh
+ DEFINE_ARGS dst, stride, stride5
+ lea stride5q, [strideq*5]
+
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*4], m1
+ SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh
+ pshuflw m1, m1, q3321 ; GHhhhhhh
+ pshufd m2, m0, q3321 ; EFGHhhhh
+ mova [dstq+strideq*1], m0
+ mova [dstq+stride5q ], m1
+ lea dstq, [dstq+strideq*2]
+ pshuflw m1, m1, q3321 ; Hhhhhhhh
+ mova [dstq+strideq*0], m3
+ mova [dstq+strideq*4], m1
+ pshuflw m1, m1, q3321 ; hhhhhhhh
+ mova [dstq+strideq*1], m2
+ mova [dstq+stride5q ], m1
+ RET
+
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefgh
+ mova m3, [aq+mmsize] ; ijklmnop
+ PALIGNR m1, m3, m0, 2, m4 ; bcdefghi
+ PALIGNR m2, m3, m0, 4, m4 ; cdefghij
+ LOWPASS 0, 1, 2 ; BCDEFGHI
+%if cpuflag(ssse3)
+ mova m4, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp
+ LOWPASS 1, 2, 3 ; JKLMNOPp
+ pshufd m2, m2, q3333 ; pppppppp
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*8+ 0], m1
+ mova [dstq+strideq*8+16], m2
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m0, m1, m0, 2
+%else
+ PALIGNR m3, m1, m0, 2, m4
+ mova m0, m3
+%endif
+ SHIFT_RIGHT m1, m1, m4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0] ; abcdefgh
+ mova m1, [aq+mmsize*1] ; ijklmnop
+ mova m2, [aq+mmsize*2] ; qrstuvwx
+ mova m3, [aq+mmsize*3] ; yz012345
+ PALIGNR m4, m1, m0, 2, m6
+ PALIGNR m5, m1, m0, 4, m6
+ LOWPASS 0, 4, 5 ; BCDEFGHI
+ PALIGNR m4, m2, m1, 2, m6
+ PALIGNR m5, m2, m1, 4, m6
+ LOWPASS 1, 4, 5 ; JKLMNOPQ
+ PALIGNR m4, m3, m2, 2, m6
+ PALIGNR m5, m3, m2, 4, m6
+ LOWPASS 2, 4, 5 ; RSTUVWXY
+%if cpuflag(ssse3)
+ mova m6, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m4, m5, m3, m6
+ LOWPASS 3, 4, 5 ; Z0123455
+ pshufd m4, m4, q3333 ; 55555555
+ DEFINE_ARGS dst, stride, stride8, stride24, cnt
+ mov cntd, 8
+ lea stride8q, [strideq*8]
+ lea stride24q, [stride8q*3]
+
+.loop:
+ mova [dstq+stride8q*0+ 0], m0
+ mova [dstq+stride8q*0+16], m1
+ mova [dstq+stride8q*0+32], m2
+ mova [dstq+stride8q*0+48], m3
+ mova [dstq+stride8q*1+ 0], m1
+ mova [dstq+stride8q*1+16], m2
+ mova [dstq+stride8q*1+32], m3
+ mova [dstq+stride8q*1+48], m4
+ mova [dstq+stride8q*2+ 0], m2
+ mova [dstq+stride8q*2+16], m3
+ mova [dstq+stride8q*2+32], m4
+ mova [dstq+stride8q*2+48], m4
+ mova [dstq+stride24q + 0], m3
+ mova [dstq+stride24q +16], m4
+ mova [dstq+stride24q +32], m4
+ mova [dstq+stride24q +48], m4
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m0, m1, m0, 2
+ vpalignr m1, m2, m1, 2
+ vpalignr m2, m3, m2, 2
+%else
+ PALIGNR m5, m1, m0, 2, m6
+ mova m0, m5
+ PALIGNR m5, m2, m1, 2, m6
+ mova m1, m5
+ PALIGNR m5, m3, m2, 2, m6
+ mova m2, m5
+%endif
+ SHIFT_RIGHT m3, m3, m6
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DL_FUNCS
+INIT_XMM ssse3
+DL_FUNCS
+INIT_XMM avx
+DL_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefghijklmnop
+ vpbroadcastw xm1, [aq+30] ; pppppppp
+ vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp
+ vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp
+ vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp
+ LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp
+ vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 2
+ lea stride3q, [strideq*3]
+.loop:
+ mova [dstq+strideq*0], m0
+ vpalignr m3, m2, m0, 2
+ vpalignr m4, m2, m0, 4
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 6
+ vpalignr m4, m2, m0, 8
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m4
+ vpalignr m3, m2, m0, 10
+ vpalignr m4, m2, m0, 12
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 14
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova m0, m2
+ vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp
+ dec cntd
+ jg .loop
+ RET
+%endif
+
+%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
+ movh m0, [lq] ; wxyz....
+ movhps m0, [aq-2] ; wxyz*abc
+ movd m1, [aq+6] ; d.......
+ PALIGNR m1, m0, 2, m2 ; xyz*abcd
+ psrldq m2, m1, 2 ; yz*abcd.
+ LOWPASS 0, 1, 2 ; XYZ#ABC.
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+stride3q ], m0
+ psrldq m0, 2 ; YZ#ABC..
+ movh [dstq+strideq*2], m0
+ psrldq m0, 2 ; Z#ABC...
+ movh [dstq+strideq*1], m0
+ psrldq m0, 2 ; #ABC....
+ movh [dstq+strideq*0], m0
+ RET
+
+cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
+ mova m0, [lq] ; stuvwxyz
+ movu m1, [aq-2] ; *abcdefg
+ mova m2, [aq] ; abcdefgh
+ psrldq m3, m2, 2 ; bcdefgh.
+ LOWPASS 3, 2, 1 ; ABCDEFG.
+ PALIGNR m1, m0, 2, m4 ; tuvwxyz*
+ PALIGNR m2, m1, 2, m4 ; uvwxyz*a
+ LOWPASS 2, 1, 0 ; TUVWXYZ#
+ DEFINE_ARGS dst, stride, dst4, stride3
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+
+ movhps [dstq +stride3q +0], m2
+ movh [dstq+ stride3q +8], m3
+ mova [dst4q+stride3q +0], m2
+ PALIGNR m1, m3, m2, 2, m0
+ psrldq m3, 2
+ movhps [dstq +strideq*2+0], m1
+ movh [dstq+ strideq*2+8], m3
+ mova [dst4q+strideq*2+0], m1
+ PALIGNR m2, m3, m1, 2, m0
+ psrldq m3, 2
+ movhps [dstq +strideq*1+0], m2
+ movh [dstq+ strideq*1+8], m3
+ mova [dst4q+strideq*1+0], m2
+ PALIGNR m1, m3, m2, 2, m0
+ psrldq m3, 2
+ movhps [dstq +strideq*0+0], m1
+ movh [dstq+ strideq*0+8], m3
+ mova [dst4q+strideq*0+0], m1
+ RET
+
+cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
+ mova m0, [lq] ; klmnopqr
+ mova m1, [lq+mmsize] ; stuvwxyz
+ movu m2, [aq-2] ; *abcdefg
+ movu m3, [aq+mmsize-2] ; hijklmno
+ mova m4, [aq] ; abcdefgh
+ mova m5, [aq+mmsize] ; ijklmnop
+ psrldq m6, m5, 2 ; jklmnop.
+ LOWPASS 6, 5, 3 ; IJKLMNO.
+ PALIGNR m5, m4, 2, m3 ; bcdefghi
+ LOWPASS 5, 4, 2 ; ABCDEFGH
+ PALIGNR m2, m1, 2, m3 ; tuvwxyz*
+ PALIGNR m4, m2, 2, m3 ; uvwxyz*a
+ LOWPASS 4, 2, 1 ; TUVWXYZ#
+ PALIGNR m1, m0, 2, m3 ; lmnopqrs
+ PALIGNR m2, m1, 2, m3 ; mnopqrst
+ LOWPASS 2, 1, 0 ; LMNOPQRS
+ DEFINE_ARGS dst, stride, dst8, cnt
+ lea dst8q, [dstq+strideq*8]
+ mov cntd, 8
+
+.loop:
+ sub dst8q, strideq
+ mova [dst8q+strideq*0+ 0], m4
+ mova [dst8q+strideq*0+16], m5
+ mova [dst8q+strideq*8+ 0], m2
+ mova [dst8q+strideq*8+16], m4
+%if cpuflag(avx)
+ vpalignr m2, m4, m2, 2
+ vpalignr m4, m5, m4, 2
+ vpalignr m5, m6, m5, 2
+%else
+ PALIGNR m0, m4, m2, 2, m1
+ mova m2, m0
+ PALIGNR m0, m5, m4, 2, m1
+ mova m4, m0
+ PALIGNR m0, m6, m5, 2, m1
+ mova m5, m0
+%endif
+ psrldq m6, 2
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
+ %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
+ mova m0, [aq+mmsize*3] ; a[24-31]
+ movu m1, [aq+mmsize*3-2] ; a[23-30]
+ psrldq m2, m0, 2 ; a[25-31].
+ LOWPASS 2, 0, 1 ; A[24-30].
+ mova m1, [aq+mmsize*2] ; a[16-23]
+ movu m3, [aq+mmsize*2-2] ; a[15-22]
+ PALIGNR m0, m1, 2, m4 ; a[17-24]
+ LOWPASS 0, 1, 3 ; A[16-23]
+ mova m3, [aq+mmsize*1] ; a[8-15]
+ movu m4, [aq+mmsize*1-2] ; a[7-14]
+ PALIGNR m1, m3, 2, m5 ; a[9-16]
+ LOWPASS 1, 3, 4 ; A[8-15]
+ mova m4, [aq+mmsize*0] ; a[0-7]
+ movu m5, [aq+mmsize*0-2] ; *a[0-6]
+ PALIGNR m3, m4, 2, m6 ; a[1-8]
+ LOWPASS 3, 4, 5 ; A[0-7]
+ SCRATCH 1, 8, rsp+0*mmsize
+ SCRATCH 3, 9, rsp+1*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 0, 10, rsp+2*mmsize
+%endif
+ mova m6, [lq+mmsize*3] ; l[24-31]
+ PALIGNR m5, m6, 2, m0 ; l[25-31]*
+ PALIGNR m4, m5, 2, m0 ; l[26-31]*a
+ LOWPASS 4, 5, 6 ; L[25-31]#
+ mova m7, [lq+mmsize*2] ; l[16-23]
+ PALIGNR m6, m7, 2, m0 ; l[17-24]
+ PALIGNR m5, m6, 2, m0 ; l[18-25]
+ LOWPASS 5, 6, 7 ; L[17-24]
+ mova m1, [lq+mmsize*1] ; l[8-15]
+ PALIGNR m7, m1, 2, m0 ; l[9-16]
+ PALIGNR m6, m7, 2, m0 ; l[10-17]
+ LOWPASS 6, 7, 1 ; L[9-16]
+ mova m3, [lq+mmsize*0] ; l[0-7]
+ PALIGNR m1, m3, 2, m0 ; l[1-8]
+ PALIGNR m7, m1, 2, m0 ; l[2-9]
+ LOWPASS 7, 1, 3 ; L[1-8]
+%if cpuflag(ssse3)
+%if cpuflag(avx)
+ UNSCRATCH 1, 8, rsp+0*mmsize
+%endif
+ UNSCRATCH 3, 9, rsp+1*mmsize
+%else
+ UNSCRATCH 0, 10, rsp+2*mmsize
+%endif
+ DEFINE_ARGS dst8, stride, stride8, stride24, cnt
+ lea stride8q, [strideq*8]
+ lea stride24q, [stride8q*3]
+ lea dst8q, [dst8q+strideq*8]
+ mov cntd, 8
+
+.loop:
+ sub dst8q, strideq
+%if notcpuflag(avx)
+ UNSCRATCH 1, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 3, 9, rsp+1*mmsize
+%endif
+%endif
+ mova [dst8q+stride8q*0+ 0], m4
+ mova [dst8q+stride8q*0+16], m3
+ mova [dst8q+stride8q*0+32], m1
+ mova [dst8q+stride8q*0+48], m0
+ mova [dst8q+stride8q*1+ 0], m5
+ mova [dst8q+stride8q*1+16], m4
+ mova [dst8q+stride8q*1+32], m3
+ mova [dst8q+stride8q*1+48], m1
+ mova [dst8q+stride8q*2+ 0], m6
+ mova [dst8q+stride8q*2+16], m5
+ mova [dst8q+stride8q*2+32], m4
+ mova [dst8q+stride8q*2+48], m3
+ mova [dst8q+stride24q + 0], m7
+ mova [dst8q+stride24q +16], m6
+ mova [dst8q+stride24q +32], m5
+ mova [dst8q+stride24q +48], m4
+%if cpuflag(avx)
+ vpalignr m7, m6, m7, 2
+ vpalignr m6, m5, m6, 2
+ vpalignr m5, m4, m5, 2
+ vpalignr m4, m3, m4, 2
+ vpalignr m3, m1, m3, 2
+ vpalignr m1, m0, m1, 2
+ vpalignr m0, m2, m0, 2
+%else
+ SCRATCH 2, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 0, 9, rsp+1*mmsize
+%endif
+ PALIGNR m2, m6, m7, 2, m0
+ mova m7, m2
+ PALIGNR m2, m5, m6, 2, m0
+ mova m6, m2
+ PALIGNR m2, m4, m5, 2, m0
+ mova m5, m2
+ PALIGNR m2, m3, m4, 2, m0
+ mova m4, m2
+ PALIGNR m2, m1, m3, 2, m0
+ mova m3, m2
+%if notcpuflag(ssse3)
+ UNSCRATCH 0, 9, rsp+1*mmsize
+ SCRATCH 3, 9, rsp+1*mmsize
+%endif
+ PALIGNR m2, m0, m1, 2, m3
+ mova m1, m2
+ UNSCRATCH 2, 8, rsp+0*mmsize
+ SCRATCH 1, 8, rsp+0*mmsize
+ PALIGNR m1, m2, m0, 2, m3
+ mova m0, m1
+%endif
+ psrldq m2, 2
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DR_FUNCS 3
+INIT_XMM ssse3
+DR_FUNCS 2
+INIT_XMM avx
+DR_FUNCS 2
+
+%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
+ movifnidn aq, amp
+ movu m0, [aq] ; abcdefgh
+ psrldq m1, m0, 2 ; bcdefgh.
+ psrldq m2, m0, 4 ; cdefgh..
+ LOWPASS 2, 1, 0 ; BCDEFGH.
+ pavgw m1, m0 ; ABCDEFG.
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+strideq*0], m1
+ movh [dstq+strideq*1], m2
+ psrldq m1, 2
+ psrldq m2, 2
+ movh [dstq+strideq*2], m1
+ movh [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefgh
+%if cpuflag(ssse3)
+ mova m3, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh
+ LOWPASS 2, 1, 0 ; BCDEFGHh
+ pavgw m1, m0 ; ABCDEFGh
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ SHIFT_RIGHT m1, m1, m3
+ SHIFT_RIGHT m2, m2, m3
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ SHIFT_RIGHT m1, m1, m3
+ SHIFT_RIGHT m2, m2, m3
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ SHIFT_RIGHT m1, m1, m3
+ SHIFT_RIGHT m2, m2, m3
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ mova m1, [aq+mmsize]
+ PALIGNR m2, m1, m0, 2, m3
+ PALIGNR m3, m1, m0, 4, m4
+ LOWPASS 3, 2, 0
+ pavgw m2, m0
+%if cpuflag(ssse3)
+ mova m4, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m5, m0, m1, m4
+ LOWPASS 0, 5, 1
+ pavgw m1, m5
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+
+.loop:
+ mova [dstq+strideq*0+ 0], m2
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m3
+ mova [dstq+strideq*1+16], m0
+ lea dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+ vpalignr m2, m1, m2, 2
+ vpalignr m3, m0, m3, 2
+%else
+ PALIGNR m5, m1, m2, 2, m4
+ mova m2, m5
+ PALIGNR m5, m0, m3, 2, m4
+ mova m3, m5
+%endif
+ SHIFT_RIGHT m1, m1, m4
+ SHIFT_RIGHT m0, m0, m4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0]
+ mova m1, [aq+mmsize*1]
+ mova m2, [aq+mmsize*2]
+ PALIGNR m6, m1, m0, 2, m5
+ PALIGNR m7, m1, m0, 4, m5
+ LOWPASS 7, 6, 0
+ pavgw m6, m0
+ SCRATCH 6, 8, rsp+0*mmsize
+ PALIGNR m4, m2, m1, 2, m0
+ PALIGNR m5, m2, m1, 4, m0
+ LOWPASS 5, 4, 1
+ pavgw m4, m1
+ mova m0, [aq+mmsize*3]
+ PALIGNR m1, m0, m2, 2, m6
+ PALIGNR m3, m0, m2, 4, m6
+ LOWPASS 3, 1, 2
+ pavgw m2, m1
+%if cpuflag(ssse3)
+ PRELOAD 10, pb_2to15_14_15, shuf
+%endif
+ SHIFT_RIGHTx2 m6, m1, m0, reg_shuf
+ LOWPASS 1, 6, 0
+ pavgw m0, m6
+%if ARCH_X86_64
+ pshufd m9, m6, q3333
+%endif
+%if cpuflag(avx)
+ UNSCRATCH 6, 8, rsp+0*mmsize
+%endif
+ DEFINE_ARGS dst, stride, cnt, stride16, stride17
+ mov stride16q, strideq
+ mov cntd, 8
+ shl stride16q, 4
+ lea stride17q, [stride16q+strideq]
+
+ ; FIXME m8 is unused for avx, so we could save one register here for win64
+.loop:
+%if notcpuflag(avx)
+ UNSCRATCH 6, 8, rsp+0*mmsize
+%endif
+ mova [dstq+strideq*0+ 0], m6
+ mova [dstq+strideq*0+16], m4
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m7
+ mova [dstq+strideq*1+16], m5
+ mova [dstq+strideq*1+32], m3
+ mova [dstq+strideq*1+48], m1
+ mova [dstq+stride16q+ 0], m4
+ mova [dstq+stride16q+16], m2
+ mova [dstq+stride16q+32], m0
+%if ARCH_X86_64
+ mova [dstq+stride16q+48], m9
+%endif
+ mova [dstq+stride17q+ 0], m5
+ mova [dstq+stride17q+16], m3
+ mova [dstq+stride17q+32], m1
+%if ARCH_X86_64
+ mova [dstq+stride17q+48], m9
+%endif
+ lea dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+ vpalignr m6, m4, m6, 2
+ vpalignr m4, m2, m4, 2
+ vpalignr m2, m0, m2, 2
+ vpalignr m7, m5, m7, 2
+ vpalignr m5, m3, m5, 2
+ vpalignr m3, m1, m3, 2
+%else
+ SCRATCH 3, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 1, 10, rsp+1*mmsize
+%endif
+ PALIGNR m3, m4, m6, 2, m1
+ mova m6, m3
+ PALIGNR m3, m2, m4, 2, m1
+ mova m4, m3
+ PALIGNR m3, m0, m2, 2, m1
+ mova m2, m3
+ PALIGNR m3, m5, m7, 2, m1
+ mova m7, m3
+ UNSCRATCH 3, 8, rsp+0*mmsize
+ SCRATCH 6, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 1, 10, rsp+1*mmsize
+ SCRATCH 7, 10, rsp+1*mmsize
+%endif
+ PALIGNR m6, m3, m5, 2, m7
+ mova m5, m6
+ PALIGNR m6, m1, m3, 2, m7
+ mova m3, m6
+%if notcpuflag(ssse3)
+ UNSCRATCH 7, 10, rsp+1*mmsize
+%endif
+%endif
+ SHIFT_RIGHT m1, m1, reg_shuf
+ SHIFT_RIGHT m0, m0, reg_shuf
+ dec cntd
+ jg .loop
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+%assign %%n 0
+%rep 4
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+48], m0
+ mova [dstq+strideq*2+48], m0
+ mova [dstq+stride3q +48], m0
+%if %%n < 3
+ lea dstq, [dstq+strideq*4]
+%endif
+%assign %%n (%%n+1)
+%endrep
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+VL_FUNCS 2
+INIT_XMM ssse3
+VL_FUNCS 1
+INIT_XMM avx
+VL_FUNCS 1
+
+%macro VR_FUNCS 0
+cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
+ movu m0, [aq-2]
+ movhps m1, [lq]
+ PALIGNR m0, m1, 10, m2 ; xyz*abcd
+ pslldq m1, m0, 2 ; .xyz*abc
+ pslldq m2, m0, 4 ; ..xyz*ab
+ LOWPASS 2, 1, 0 ; ..YZ#ABC
+ pavgw m1, m0 ; ....#ABC
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movhps [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m2
+ shufps m0, m2, m1, q3210
+%if cpuflag(ssse3)
+ pshufb m2, [pb_4_5_8to13_8x0]
+%else
+ pshuflw m2, m2, q2222
+ psrldq m2, 6
+%endif
+ psrldq m0, 6
+ movh [dstq+strideq*2], m0
+ movh [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
+ movu m1, [aq-2] ; *abcdefg
+ movu m2, [lq] ; stuvwxyz
+ mova m0, [aq] ; abcdefgh
+ PALIGNR m3, m1, m2, 14, m4 ; z*abcdef
+ LOWPASS 3, 1, 0
+ pavgw m0, m1
+ PALIGNR m1, m2, 2, m4 ; tuvwxyz*
+ pslldq m4, m2, 2 ; .stuvwxy
+ LOWPASS 4, 2, 1
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m3
+ PALIGNR m0, m4, 14, m1
+ pslldq m4, 2
+ PALIGNR m3, m4, 14, m1
+ pslldq m4, 2
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ PALIGNR m0, m4, 14, m1
+ pslldq m4, 2
+ PALIGNR m3, m4, 14, m1
+ pslldq m4, 2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m3
+ PALIGNR m0, m4, 14, m1
+ pslldq m4, 2
+ PALIGNR m3, m4, 14, m4
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m3
+ RET
+
+cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
+ movu m1, [aq-2] ; *abcdefg
+ movu m2, [aq+mmsize-2] ; hijklmno
+ mova m3, [aq] ; abcdefgh
+ mova m4, [aq+mmsize] ; ijklmnop
+ mova m5, [lq+mmsize] ; stuvwxyz
+ PALIGNR m0, m1, m5, 14, m6 ; z*abcdef
+ movu m6, [aq+mmsize-4] ; ghijklmn
+ LOWPASS 6, 2, 4
+ pavgw m2, m4
+ LOWPASS 0, 1, 3
+ pavgw m3, m1
+ PALIGNR m1, m5, 2, m7 ; tuvwxyz*
+ movu m7, [lq+mmsize-2] ; rstuvwxy
+ LOWPASS 1, 5, 7
+ movu m5, [lq+2] ; lmnopqrs
+ pslldq m4, m5, 2 ; .lmnopqr
+ pslldq m7, m5, 4 ; ..lmnopq
+ LOWPASS 5, 4, 7
+ psrld m4, m1, 16
+ psrld m7, m5, 16
+ pand m1, [pd_65535]
+ pand m5, [pd_65535]
+ packssdw m7, m4
+ packssdw m5, m1
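+ ; m5/m7 now hold the even/odd words of the two filtered left vectors;
+ ; packssdw cannot saturate here since all values fit in 12 bits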
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+
+.loop:
+ mova [dstq+strideq*0+ 0], m3
+ mova [dstq+strideq*0+16], m2
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m6
+ lea dstq, [dstq+strideq*2]
+ PALIGNR m2, m3, 14, m4
+ PALIGNR m3, m7, 14, m4
+ pslldq m7, 2
+ PALIGNR m6, m0, 14, m4
+ PALIGNR m0, m5, 14, m4
+ pslldq m5, 2
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
+ movu m0, [aq+mmsize*0-2] ; *a[0-6]
+ movu m1, [aq+mmsize*1-2] ; a[7-14]
+ movu m2, [aq+mmsize*2-2] ; a[15-22]
+ movu m3, [aq+mmsize*3-2] ; a[23-30]
+ mova m4, [aq+mmsize*3+0] ; a[24-31]
+ movu m5, [aq+mmsize*3-4] ; a[22-29]
+ LOWPASS 5, 3, 4 ; A[23-30]
+ SCRATCH 5, 8, rsp+0*mmsize
+ pavgw m3, m4
+ mova m4, [aq+mmsize*2+0] ; a[16-23]
+ movu m6, [aq+mmsize*2-4] ; a[14-21]
+ LOWPASS 6, 2, 4 ; A[15-22]
+ SCRATCH 6, 9, rsp+1*mmsize
+ pavgw m2, m4
+ mova m4, [aq+mmsize*1+0] ; a[8-15]
+ movu m7, [aq+mmsize*1-4] ; a[6-13]
+ LOWPASS 7, 1, 4 ; A[7-14]
+ SCRATCH 7, 10, rsp+2*mmsize
+ pavgw m1, m4
+ mova m4, [aq+mmsize*0+0] ; a[0-7]
+ mova m5, [lq+mmsize*3+0] ; l[24-31]
+ PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5]
+ LOWPASS 6, 0, 4 ; #A[0-6]
+ SCRATCH 6, 11, rsp+3*mmsize
+ pavgw m4, m0
+ PALIGNR m0, m5, 2, m7 ; l[25-31]*
+ movu m7, [lq+mmsize*3-2] ; l[23-30]
+ LOWPASS 0, 5, 7 ; L[24-31]
+ movu m5, [lq+mmsize*2-2] ; l[15-22]
+ mova m7, [lq+mmsize*2+0] ; l[16-23]
+ movu m6, [lq+mmsize*2+2] ; l[17-24]
+ LOWPASS 5, 7, 6 ; L[16-23]
+ psrld m7, m0, 16
+ psrld m6, m5, 16
+ pand m0, [pd_65535]
+ pand m5, [pd_65535]
+ packssdw m6, m7
+ packssdw m5, m0
+ SCRATCH 5, 12, rsp+4*mmsize
+ SCRATCH 6, 13, rsp+5*mmsize
+ movu m6, [lq+mmsize*1-2] ; l[7-14]
+ mova m0, [lq+mmsize*1+0] ; l[8-15]
+ movu m5, [lq+mmsize*1+2] ; l[9-16]
+ LOWPASS 6, 0, 5 ; L[8-15]
+ movu m0, [lq+mmsize*0+2] ; l[1-8]
+ pslldq m5, m0, 2 ; .l[1-7]
+ pslldq m7, m0, 4 ; ..l[1-6]
+ LOWPASS 0, 5, 7
+ psrld m5, m6, 16
+ psrld m7, m0, 16
+ pand m6, [pd_65535]
+ pand m0, [pd_65535]
+ packssdw m7, m5
+ packssdw m0, m6
+ UNSCRATCH 6, 13, rsp+5*mmsize
+ DEFINE_ARGS dst, stride, stride16, cnt, stride17
+ mov stride16q, strideq
+ mov cntd, 8
+ shl stride16q, 4
+%if ARCH_X86_64
+ lea stride17q, [stride16q+strideq]
+%endif
+
+.loop:
+ mova [dstq+strideq*0+ 0], m4
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+%if ARCH_X86_64
+ mova [dstq+strideq*1+ 0], m11
+ mova [dstq+strideq*1+16], m10
+ mova [dstq+strideq*1+32], m9
+ mova [dstq+strideq*1+48], m8
+%endif
+ mova [dstq+stride16q+ 0], m6
+ mova [dstq+stride16q+16], m4
+ mova [dstq+stride16q+32], m1
+ mova [dstq+stride16q+48], m2
+%if ARCH_X86_64
+ mova [dstq+stride17q+ 0], m12
+ mova [dstq+stride17q+16], m11
+ mova [dstq+stride17q+32], m10
+ mova [dstq+stride17q+48], m9
+%endif
+ lea dstq, [dstq+strideq*2]
+ PALIGNR m3, m2, 14, m5
+ PALIGNR m2, m1, 14, m5
+ PALIGNR m1, m4, 14, m5
+ PALIGNR m4, m6, 14, m5
+ PALIGNR m6, m7, 14, m5
+ pslldq m7, 2
+%if ARCH_X86_64
+ PALIGNR m8, m9, 14, m5
+ PALIGNR m9, m10, 14, m5
+ PALIGNR m10, m11, 14, m5
+ PALIGNR m11, m12, 14, m5
+ PALIGNR m12, m0, 14, m5
+ pslldq m0, 2
+%endif
+ dec cntd
+ jg .loop
+
+%if ARCH_X86_32
+ UNSCRATCH 5, 12, rsp+4*mmsize
+ UNSCRATCH 4, 11, rsp+3*mmsize
+ UNSCRATCH 3, 10, rsp+2*mmsize
+ UNSCRATCH 2, 9, rsp+1*mmsize
+ UNSCRATCH 1, 8, rsp+0*mmsize
+ mov dstq, dstm
+ mov cntd, 8
+ add dstq, strideq
+.loop2:
+ mova [dstq+strideq*0+ 0], m4
+ mova [dstq+strideq*0+16], m3
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m1
+ mova [dstq+stride16q+ 0], m5
+ mova [dstq+stride16q+16], m4
+ mova [dstq+stride16q+32], m3
+ mova [dstq+stride16q+48], m2
+ lea dstq, [dstq+strideq*2]
+ PALIGNR m1, m2, 14, m6
+ PALIGNR m2, m3, 14, m6
+ PALIGNR m3, m4, 14, m6
+ PALIGNR m4, m5, 14, m6
+ PALIGNR m5, m0, 14, m6
+ pslldq m0, 2
+ dec cntd
+ jg .loop2
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+VR_FUNCS
+INIT_XMM ssse3
+VR_FUNCS
+INIT_XMM avx
+VR_FUNCS
+
+%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
+ movh m0, [lq] ; abcd
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to7_67x4] ; abcddddd
+%else
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333 ; abcddddd
+%endif
+ psrldq m1, m0, 2 ; bcddddd.
+ psrldq m2, m0, 4 ; cddddd..
+ LOWPASS 2, 1, 0 ; BCDddd..
+ pavgw m1, m0 ; abcddddd
+ SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd
+ PALIGNR m2, m1, 4, m0 ; bCcDdddd
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+strideq*0], m1 ; aBbC
+ movh [dstq+strideq*1], m2 ; bCcD
+ movhps [dstq+strideq*2], m1 ; cDdd
+ movhps [dstq+stride3q ], m2 ; dddd
+ RET
+
+cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
+ mova m0, [lq]
+%if cpuflag(ssse3)
+ mova m3, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m2, m0, m3
+ LOWPASS 2, 1, 0
+ pavgw m1, m0
+ SBUTTERFLY wd, 1, 2, 0
+ shufps m0, m1, m2, q1032
+ pshufd m3, m2, q3332
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ mova [dstq+strideq *0], m1
+ mova [dstq+strideq *2], m0
+ mova [dstq+strideq *4], m2
+ mova [dstq+stride3q*2], m3
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m1, m2, m1, 4
+%else
+ PALIGNR m0, m2, m1, 4, m3
+ mova m1, m0
+%endif
+ pshufd m2, m2, q3321
+ shufps m0, m1, m2, q1032
+ pshufd m3, m2, q3332
+ mova [dstq+strideq *0], m1
+ mova [dstq+strideq *2], m0
+ mova [dstq+strideq *4], m2
+ mova [dstq+stride3q*2], m3
+ RET
+
+cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
+ mova m0, [lq]
+ mova m3, [lq+mmsize]
+ movu m1, [lq+2]
+ movu m2, [lq+4]
+ LOWPASS 2, 1, 0
+ pavgw m1, m0
+ SBUTTERFLY wd, 1, 2, 0
+%if cpuflag(ssse3)
+ mova m5, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m0, m4, m3, m5
+ LOWPASS 4, 0, 3
+ pavgw m3, m0
+ SBUTTERFLY wd, 3, 4, 5
+ pshufd m0, m0, q3333
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+
+.loop:
+ mova [dstq+strideq *0+ 0], m1
+ mova [dstq+strideq *0+16], m2
+ mova [dstq+strideq *4+ 0], m2
+ mova [dstq+strideq *4+16], m3
+ mova [dstq+strideq *8+ 0], m3
+ mova [dstq+strideq *8+16], m4
+ mova [dstq+stride3q*4+ 0], m4
+ mova [dstq+stride3q*4+16], m0
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m1, m2, m1, 4
+ vpalignr m2, m3, m2, 4
+ vpalignr m3, m4, m3, 4
+ vpalignr m4, m0, m4, 4
+%else
+ PALIGNR m5, m2, m1, 4, m6
+ mova m1, m5
+ PALIGNR m5, m3, m2, 4, m6
+ mova m2, m5
+ PALIGNR m5, m4, m3, 4, m6
+ mova m3, m5
+ PALIGNR m5, m0, m4, 4, m6
+ mova m4, m5
+%endif
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
+ %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
+ mova m2, [lq+mmsize*0+0]
+ movu m1, [lq+mmsize*0+2]
+ movu m0, [lq+mmsize*0+4]
+ LOWPASS 0, 1, 2
+ pavgw m1, m2
+ SBUTTERFLY wd, 1, 0, 2
+ SCRATCH 1, 8, rsp+0*mmsize
+ mova m4, [lq+mmsize*1+0]
+ movu m3, [lq+mmsize*1+2]
+ movu m2, [lq+mmsize*1+4]
+ LOWPASS 2, 3, 4
+ pavgw m3, m4
+ SBUTTERFLY wd, 3, 2, 4
+ mova m6, [lq+mmsize*2+0]
+ movu m5, [lq+mmsize*2+2]
+ movu m4, [lq+mmsize*2+4]
+ LOWPASS 4, 5, 6
+ pavgw m5, m6
+ SBUTTERFLY wd, 5, 4, 6
+ mova m7, [lq+mmsize*3+0]
+ SCRATCH 0, 9, rsp+1*mmsize
+%if cpuflag(ssse3)
+ mova m0, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m6, m7, m0
+ LOWPASS 6, 1, 7
+ pavgw m7, m1
+ SBUTTERFLY wd, 7, 6, 0
+ pshufd m1, m1, q3333
+ UNSCRATCH 0, 9, rsp+1*mmsize
+ DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+ lea stride3q, [strideq*3]
+ lea stride4q, [strideq*4]
+ lea stride28q, [stride4q*8]
+ lea stride20q, [stride4q*5]
+ sub stride28q, stride4q
+ mov cntd, 4
+
+.loop:
+%if ARCH_X86_64
+ SWAP 1, 8
+%else
+ mova [rsp+1*mmsize], m1
+ mova m1, [rsp+0*mmsize]
+%endif
+ mova [dstq+strideq *0+ 0], m1
+ mova [dstq+strideq *0+16], m0
+ mova [dstq+strideq *0+32], m3
+ mova [dstq+strideq *0+48], m2
+ mova [dstq+stride4q*1+ 0], m0
+ mova [dstq+stride4q*1+16], m3
+ mova [dstq+stride4q*1+32], m2
+ mova [dstq+stride4q*1+48], m5
+ mova [dstq+stride4q*2+ 0], m3
+ mova [dstq+stride4q*2+16], m2
+ mova [dstq+stride4q*2+32], m5
+ mova [dstq+stride4q*2+48], m4
+%if cpuflag(avx)
+ vpalignr m1, m0, m1, 4
+ vpalignr m0, m3, m0, 4
+ vpalignr m3, m2, m3, 4
+%else
+ SCRATCH 6, 9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 7, 10, rsp+3*mmsize
+%endif
+ PALIGNR m6, m0, m1, 4, m7
+ mova m1, m6
+ PALIGNR m6, m3, m0, 4, m7
+ mova m0, m6
+ PALIGNR m6, m2, m3, 4, m7
+ mova m3, m6
+ UNSCRATCH 6, 9, rsp+2*mmsize
+ SCRATCH 0, 9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 7, 10, rsp+3*mmsize
+ SCRATCH 3, 10, rsp+3*mmsize
+%endif
+%endif
+%if ARCH_X86_64
+ SWAP 1, 8
+%else
+ mova [rsp+0*mmsize], m1
+ mova m1, [rsp+1*mmsize]
+%endif
+ mova [dstq+stride3q*4+ 0], m2
+ mova [dstq+stride3q*4+16], m5
+ mova [dstq+stride3q*4+32], m4
+ mova [dstq+stride3q*4+48], m7
+ mova [dstq+stride4q*4+ 0], m5
+ mova [dstq+stride4q*4+16], m4
+ mova [dstq+stride4q*4+32], m7
+ mova [dstq+stride4q*4+48], m6
+ mova [dstq+stride20q + 0], m4
+ mova [dstq+stride20q +16], m7
+ mova [dstq+stride20q +32], m6
+ mova [dstq+stride20q +48], m1
+ mova [dstq+stride3q*8+ 0], m7
+ mova [dstq+stride3q*8+16], m6
+ mova [dstq+stride3q*8+32], m1
+ mova [dstq+stride3q*8+48], m1
+ mova [dstq+stride28q + 0], m6
+ mova [dstq+stride28q +16], m1
+ mova [dstq+stride28q +32], m1
+ mova [dstq+stride28q +48], m1
+%if cpuflag(avx)
+ vpalignr m2, m5, m2, 4
+ vpalignr m5, m4, m5, 4
+ vpalignr m4, m7, m4, 4
+ vpalignr m7, m6, m7, 4
+ vpalignr m6, m1, m6, 4
+%else
+ PALIGNR m0, m5, m2, 4, m3
+ mova m2, m0
+ PALIGNR m0, m4, m5, 4, m3
+ mova m5, m0
+ PALIGNR m0, m7, m4, 4, m3
+ mova m4, m0
+ PALIGNR m0, m6, m7, 4, m3
+ mova m7, m0
+ PALIGNR m0, m1, m6, 4, m3
+ mova m6, m0
+ UNSCRATCH 0, 9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 3, 10, rsp+3*mmsize
+%endif
+%endif
+ add dstq, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HU_FUNCS 4
+INIT_XMM ssse3
+HU_FUNCS 3
+INIT_XMM avx
+HU_FUNCS 2
+
+%macro HD_FUNCS 0
+cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
+ movh m0, [lq]
+ movhps m0, [aq-2]
+ psrldq m1, m0, 2
+ psrldq m2, m0, 4
+ LOWPASS 2, 1, 0
+ pavgw m1, m0
+ punpcklwd m1, m2
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+stride3q ], m1
+ movhps [dstq+strideq*1], m1
+ movhlps m2, m2
+ PALIGNR m2, m1, 4, m0
+ movh [dstq+strideq*2], m2
+ movhps [dstq+strideq*0], m2
+ RET
+
+cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
+ mova m0, [lq]
+ movu m1, [aq-2]
+ PALIGNR m2, m1, m0, 2, m3
+ PALIGNR m3, m1, m0, 4, m4
+ LOWPASS 3, 2, 0
+ pavgw m2, m0
+ SBUTTERFLY wd, 2, 3, 0
+ psrldq m0, m1, 2
+ psrldq m4, m1, 4
+ LOWPASS 1, 0, 4
+ DEFINE_ARGS dst8, mstride, cnt
+ lea dst8q, [dst8q+mstrideq*8]
+ neg mstrideq
+ mov cntd, 4
+
+.loop:
+ add dst8q, mstrideq
+ mova [dst8q+mstrideq*0], m2
+ mova [dst8q+mstrideq*4], m3
+%if cpuflag(avx)
+ vpalignr m2, m3, m2, 4
+ vpalignr m3, m1, m3, 4
+%else
+ PALIGNR m0, m3, m2, 4, m4
+ mova m2, m0
+ PALIGNR m0, m1, m3, 4, m4
+ mova m3, m0
+%endif
+ psrldq m1, 4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
+ mova m2, [lq]
+ movu m1, [lq+2]
+ movu m0, [lq+4]
+ LOWPASS 0, 1, 2
+ pavgw m1, m2
+ mova m4, [lq+mmsize]
+ movu m5, [aq-2]
+ PALIGNR m3, m5, m4, 2, m6
+ PALIGNR m2, m5, m4, 4, m6
+ LOWPASS 2, 3, 4
+ pavgw m3, m4
+ SBUTTERFLY wd, 1, 0, 4
+ SBUTTERFLY wd, 3, 2, 4
+ mova m6, [aq]
+ movu m4, [aq+2]
+ LOWPASS 4, 6, 5
+ movu m5, [aq+mmsize-2]
+ psrldq m6, m5, 2
+ psrldq m7, m5, 4
+ LOWPASS 5, 6, 7
+ DEFINE_ARGS dst, mstride, mstride3, cnt
+ lea dstq, [dstq+mstrideq*8]
+ lea dstq, [dstq+mstrideq*8]
+ neg mstrideq
+ lea mstride3q, [mstrideq*3]
+ mov cntd, 4
+
+.loop:
+ add dstq, mstrideq
+ mova [dstq+mstride3q*4+ 0], m2
+ mova [dstq+mstride3q*4+16], m4
+ mova [dstq+mstrideq *8+ 0], m3
+ mova [dstq+mstrideq *8+16], m2
+ mova [dstq+mstrideq *4+ 0], m0
+ mova [dstq+mstrideq *4+16], m3
+ mova [dstq+mstrideq *0+ 0], m1
+ mova [dstq+mstrideq *0+16], m0
+%if cpuflag(avx)
+ vpalignr m1, m0, m1, 4
+ vpalignr m0, m3, m0, 4
+ vpalignr m3, m2, m3, 4
+ vpalignr m2, m4, m2, 4
+ vpalignr m4, m5, m4, 4
+%else
+ PALIGNR m6, m0, m1, 4, m7
+ mova m1, m6
+ PALIGNR m6, m3, m0, 4, m7
+ mova m0, m6
+ PALIGNR m6, m2, m3, 4, m7
+ mova m3, m6
+ PALIGNR m6, m4, m2, 4, m7
+ mova m2, m6
+ PALIGNR m6, m5, m4, 4, m7
+ mova m4, m6
+%endif
+ psrldq m5, 4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
+ 10 * -mmsize * ARCH_X86_32, dst, stride, l, a
+ mova m2, [lq+mmsize*0+0]
+ movu m1, [lq+mmsize*0+2]
+ movu m0, [lq+mmsize*0+4]
+ LOWPASS 0, 1, 2
+ pavgw m1, m2
+ SBUTTERFLY wd, 1, 0, 2
+ mova m4, [lq+mmsize*1+0]
+ movu m3, [lq+mmsize*1+2]
+ movu m2, [lq+mmsize*1+4]
+ LOWPASS 2, 3, 4
+ pavgw m3, m4
+ SBUTTERFLY wd, 3, 2, 4
+ SCRATCH 0, 8, rsp+0*mmsize
+ SCRATCH 1, 9, rsp+1*mmsize
+ SCRATCH 2, 10, rsp+2*mmsize
+ SCRATCH 3, 11, rsp+3*mmsize
+ mova m6, [lq+mmsize*2+0]
+ movu m5, [lq+mmsize*2+2]
+ movu m4, [lq+mmsize*2+4]
+ LOWPASS 4, 5, 6
+ pavgw m5, m6
+ SBUTTERFLY wd, 5, 4, 6
+ mova m0, [lq+mmsize*3+0]
+ movu m1, [aq+mmsize*0-2]
+ PALIGNR m7, m1, m0, 2, m2
+ PALIGNR m6, m1, m0, 4, m2
+ LOWPASS 6, 7, 0
+ pavgw m7, m0
+ SBUTTERFLY wd, 7, 6, 0
+ mova m2, [aq+mmsize*0+0]
+ movu m0, [aq+mmsize*0+2]
+ LOWPASS 0, 2, 1
+ movu m1, [aq+mmsize*1-2]
+ mova m2, [aq+mmsize*1+0]
+ movu m3, [aq+mmsize*1+2]
+ LOWPASS 1, 2, 3
+ SCRATCH 6, 12, rsp+6*mmsize
+ SCRATCH 7, 13, rsp+7*mmsize
+ movu m2, [aq+mmsize*2-2]
+ mova m3, [aq+mmsize*2+0]
+ movu m6, [aq+mmsize*2+2]
+ LOWPASS 2, 3, 6
+ movu m3, [aq+mmsize*3-2]
+ psrldq m6, m3, 2
+ psrldq m7, m3, 4
+ LOWPASS 3, 6, 7
+ UNSCRATCH 6, 12, rsp+6*mmsize
+ UNSCRATCH 7, 13, rsp+7*mmsize
+%if ARCH_X86_32
+ mova [rsp+4*mmsize], m4
+ mova [rsp+5*mmsize], m5
+ ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
+ ; to do it again here
+%endif
+ DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+ mov cntd, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ lea stride4q, [strideq*4]
+ lea stride28q, [stride4q*8]
+ lea stride20q, [stride4q*5]
+ sub stride28q, stride4q
+%endif
+ add dstq, stride3q
+
+ ; x86-32 doesn't have enough registers, so on that platform, we split
+ ; the loop in 2... Otherwise you spend most of the loop (un)scratching
+.loop:
+%if ARCH_X86_64
+ mova [dstq+stride28q + 0], m9
+ mova [dstq+stride28q +16], m8
+ mova [dstq+stride28q +32], m11
+ mova [dstq+stride28q +48], m10
+ mova [dstq+stride3q*8+ 0], m8
+ mova [dstq+stride3q*8+16], m11
+ mova [dstq+stride3q*8+32], m10
+ mova [dstq+stride3q*8+48], m5
+ mova [dstq+stride20q + 0], m11
+ mova [dstq+stride20q +16], m10
+ mova [dstq+stride20q +32], m5
+ mova [dstq+stride20q +48], m4
+ mova [dstq+stride4q*4+ 0], m10
+ mova [dstq+stride4q*4+16], m5
+ mova [dstq+stride4q*4+32], m4
+ mova [dstq+stride4q*4+48], m7
+%endif
+ mova [dstq+stride3q*4+ 0], m5
+ mova [dstq+stride3q*4+16], m4
+ mova [dstq+stride3q*4+32], m7
+ mova [dstq+stride3q*4+48], m6
+ mova [dstq+strideq* 8+ 0], m4
+ mova [dstq+strideq* 8+16], m7
+ mova [dstq+strideq* 8+32], m6
+ mova [dstq+strideq* 8+48], m0
+ mova [dstq+strideq* 4+ 0], m7
+ mova [dstq+strideq* 4+16], m6
+ mova [dstq+strideq* 4+32], m0
+ mova [dstq+strideq* 4+48], m1
+ mova [dstq+strideq* 0+ 0], m6
+ mova [dstq+strideq* 0+16], m0
+ mova [dstq+strideq* 0+32], m1
+ mova [dstq+strideq* 0+48], m2
+ sub dstq, strideq
+%if cpuflag(avx)
+%if ARCH_X86_64
+ vpalignr m9, m8, m9, 4
+ vpalignr m8, m11, m8, 4
+ vpalignr m11, m10, m11, 4
+ vpalignr m10, m5, m10, 4
+%endif
+ vpalignr m5, m4, m5, 4
+ vpalignr m4, m7, m4, 4
+ vpalignr m7, m6, m7, 4
+ vpalignr m6, m0, m6, 4
+ vpalignr m0, m1, m0, 4
+ vpalignr m1, m2, m1, 4
+ vpalignr m2, m3, m2, 4
+%else
+%if ARCH_X86_64
+ PALIGNR m12, m8, m9, 4, m13
+ mova m9, m12
+ PALIGNR m12, m11, m8, 4, m13
+ mova m8, m12
+ PALIGNR m12, m10, m11, 4, m13
+ mova m11, m12
+ PALIGNR m12, m5, m10, 4, m13
+ mova m10, m12
+%endif
+ SCRATCH 3, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+ SCRATCH 2, 13, rsp+9*mmsize
+%endif
+ PALIGNR m3, m4, m5, 4, m2
+ mova m5, m3
+ PALIGNR m3, m7, m4, 4, m2
+ mova m4, m3
+ PALIGNR m3, m6, m7, 4, m2
+ mova m7, m3
+ PALIGNR m3, m0, m6, 4, m2
+ mova m6, m3
+ PALIGNR m3, m1, m0, 4, m2
+ mova m0, m3
+%if notcpuflag(ssse3)
+ UNSCRATCH 2, 13, rsp+9*mmsize
+ SCRATCH 0, 13, rsp+9*mmsize
+%endif
+ PALIGNR m3, m2, m1, 4, m0
+ mova m1, m3
+ PALIGNR m3, reg_sh, m2, 4, m0
+ mova m2, m3
+%if notcpuflag(ssse3)
+ UNSCRATCH 0, 13, rsp+9*mmsize
+%endif
+ UNSCRATCH 3, 12, rsp+8*mmsize, sh
+%endif
+ psrldq m3, 4
+ dec cntd
+ jg .loop
+
+%if ARCH_X86_32
+ UNSCRATCH 0, 8, rsp+0*mmsize
+ UNSCRATCH 1, 9, rsp+1*mmsize
+ UNSCRATCH 2, 10, rsp+2*mmsize
+ UNSCRATCH 3, 11, rsp+3*mmsize
+ mova m4, [rsp+4*mmsize]
+ mova m5, [rsp+5*mmsize]
+ mova m6, [rsp+6*mmsize]
+ mova m7, [rsp+7*mmsize]
+ DEFINE_ARGS dst, stride, stride5, stride3
+ lea stride5q, [strideq*5]
+ lea dstq, [dstq+stride5q*4]
+ DEFINE_ARGS dst, stride, cnt, stride3
+ mov cntd, 4
+.loop_2:
+ mova [dstq+stride3q*4+ 0], m1
+ mova [dstq+stride3q*4+16], m0
+ mova [dstq+stride3q*4+32], m3
+ mova [dstq+stride3q*4+48], m2
+ mova [dstq+strideq* 8+ 0], m0
+ mova [dstq+strideq* 8+16], m3
+ mova [dstq+strideq* 8+32], m2
+ mova [dstq+strideq* 8+48], m5
+ mova [dstq+strideq* 4+ 0], m3
+ mova [dstq+strideq* 4+16], m2
+ mova [dstq+strideq* 4+32], m5
+ mova [dstq+strideq* 4+48], m4
+ mova [dstq+strideq* 0+ 0], m2
+ mova [dstq+strideq* 0+16], m5
+ mova [dstq+strideq* 0+32], m4
+ mova [dstq+strideq* 0+48], m7
+ sub dstq, strideq
+%if cpuflag(avx)
+ vpalignr m1, m0, m1, 4
+ vpalignr m0, m3, m0, 4
+ vpalignr m3, m2, m3, 4
+ vpalignr m2, m5, m2, 4
+ vpalignr m5, m4, m5, 4
+ vpalignr m4, m7, m4, 4
+ vpalignr m7, m6, m7, 4
+%else
+ SCRATCH 6, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+ SCRATCH 7, 13, rsp+9*mmsize
+%endif
+ PALIGNR m6, m0, m1, 4, m7
+ mova m1, m6
+ PALIGNR m6, m3, m0, 4, m7
+ mova m0, m6
+ PALIGNR m6, m2, m3, 4, m7
+ mova m3, m6
+ PALIGNR m6, m5, m2, 4, m7
+ mova m2, m6
+ PALIGNR m6, m4, m5, 4, m7
+ mova m5, m6
+%if notcpuflag(ssse3)
+ UNSCRATCH 7, 13, rsp+9*mmsize
+ SCRATCH 5, 13, rsp+9*mmsize
+%endif
+ PALIGNR m6, m7, m4, 4, m5
+ mova m4, m6
+ PALIGNR m6, reg_sh, m7, 4, m5
+ mova m7, m6
+%if notcpuflag(ssse3)
+ UNSCRATCH 5, 13, rsp+9*mmsize
+%endif
+ UNSCRATCH 6, 12, rsp+8*mmsize, sh
+%endif
+ psrldq m6, 4
+ dec cntd
+ jg .loop_2
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+HD_FUNCS
+INIT_XMM ssse3
+HD_FUNCS
+INIT_XMM avx
+HD_FUNCS
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
new file mode 100644
index 0000000000..2c63fe514a
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -0,0 +1,3197 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+%include "vp9itxfm_template.asm"
+
+SECTION_RODATA 32
+
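+; emit the paired word constants (pw_m<a>_<b>, pw_<b>_<a> and, with %3==1,
+; additional sign/order combinations) used by the pmaddwd butterflies below,
+; plus doubled single constants for pmulhrsw, which shifts right by 15 and
+; therefore needs 2*coeff to give an effective /2^14 scale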
+%macro VP9_IDCT_COEFFS 2-3 0
+const pw_m%1_%2
+times 8 dw -%1, %2
+const pw_%2_%1
+times 8 dw %2, %1
+
+%if %3 == 1
+const pw_m%2_m%1
+times 8 dw -%2, -%1
+%if %1 != %2
+const pw_m%2_%1
+times 8 dw -%2, %1
+const pw_%1_%2
+times 8 dw %1, %2
+%endif
+%endif
+
+%if %1 < 11585
+pw_m%1x2: times 16 dw -%1*2
+%elif %1 > 11585
+pw_%1x2: times 16 dw %1*2
+%else
+const pw_%1x2
+times 16 dw %1*2
+%endif
+
+%if %2 != %1
+pw_%2x2: times 16 dw %2*2
+%endif
+%endmacro
+
+VP9_IDCT_COEFFS 16364, 804
+VP9_IDCT_COEFFS 16305, 1606
+VP9_IDCT_COEFFS 16069, 3196, 1
+VP9_IDCT_COEFFS 15893, 3981
+VP9_IDCT_COEFFS 15137, 6270, 1
+VP9_IDCT_COEFFS 14811, 7005
+VP9_IDCT_COEFFS 14449, 7723
+VP9_IDCT_COEFFS 13160, 9760
+VP9_IDCT_COEFFS 11585, 11585, 1
+VP9_IDCT_COEFFS 11003, 12140
+VP9_IDCT_COEFFS 10394, 12665
+VP9_IDCT_COEFFS 9102, 13623, 1
+VP9_IDCT_COEFFS 8423, 14053
+VP9_IDCT_COEFFS 5520, 15426
+VP9_IDCT_COEFFS 4756, 15679
+VP9_IDCT_COEFFS 2404, 16207
+
+const pw_5283_13377
+times 4 dw 5283, 13377
+const pw_9929_13377
+times 4 dw 9929, 13377
+const pw_15212_m13377
+times 4 dw 15212, -13377
+const pw_15212_9929
+times 4 dw 15212, 9929
+const pw_m5283_m15212
+times 4 dw -5283, -15212
+const pw_13377x2
+times 8 dw 13377*2
+const pw_m13377_13377
+times 4 dw -13377, 13377
+const pw_13377_0
+times 4 dw 13377, 0
+
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_m1
+cextern pd_8192
+
+SECTION .text
+
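+; 32-bit butterfly rotation: interleave src1/src2 and pmaddwd with the paired
+; constants so that dst1/dst3 hold the low/high dwords of src1*mul2-src2*mul1
+; and dst2/dst4 the low/high dwords of src1*mul1+src2*mul2; rounding and
+; packing back to words is left to VP9_RND_SH_SUMSUB_BA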
+%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
+ punpckhwd m%4, m%2, m%1
+ punpcklwd m%2, m%1
+ pmaddwd m%3, m%4, [pw_m%5_%6]
+ pmaddwd m%4, [pw_%6_%5]
+ pmaddwd m%1, m%2, [pw_m%5_%6]
+ pmaddwd m%2, [pw_%6_%5]
+%endmacro
+
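+; sum/difference of the dword butterfly halves from above, then add the
+; rounding constant in %6, shift right by 14 and pack the results back to words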
+%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
+ SUMSUB_BA d, %1, %2, %5
+ SUMSUB_BA d, %3, %4, %5
+ paddd m%1, %6
+ paddd m%2, %6
+ paddd m%3, %6
+ paddd m%4, %6
+ psrad m%1, 14
+ psrad m%2, 14
+ psrad m%3, 14
+ psrad m%4, 14
+ packssdw m%1, m%3
+ packssdw m%2, m%4
+%endmacro
+
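+; add two rows of word residuals (reg1/reg2) to the destination pixels at
+; dst and dst+strideq, clipping with unsigned saturation on the way back to bytes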
+%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
+%if mmsize == 32
+ pmovzxbw m%3, [%6]
+ pmovzxbw m%4, [%6+strideq]
+%else
+ movh m%3, [%6]
+ movh m%4, [%6+strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+%endif
+ paddw m%3, m%1
+ paddw m%4, m%2
+%if mmsize == 32
+ packuswb m%3, m%4
+ ; packuswb on AVX2 packs within each 128-bit lane, interleaving the two
+ ; rows, so restore row order with vpermq
+ vpermq m%3, m%3, q3120
+ mova [%6], xm%3
+ vextracti128 [%6+strideq], m%3, 1
+%elif mmsize == 16
+ packuswb m%3, m%4
+ movh [%6], m%3
+ movhps [%6+strideq], m%3
+%else
+ packuswb m%3, m%5
+ packuswb m%4, m%5
+ movh [%6], m%3
+ movh [%6+strideq], m%4
+%endif
+%endmacro
+
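+; clear an nnzcpl x nnzcpl block of int16 coefficients (rows %2 bytes apart)
+; using the supplied zero register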
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*2/mmsize
+ mova [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
+ mova m0, [blockq+0*8]
+ mova m1, [blockq+1*8]
+ mova m2, [blockq+2*8]
+ mova m3, [blockq+3*8]
+ psraw m0, 2
+ psraw m1, 2
+ psraw m2, 2
+ psraw m3, 2
+
+ VP9_IWHT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IWHT4_1D
+
+ pxor m4, m4
+ VP9_STORE_2X 0, 1, 5, 6, 4
+ lea dstq, [dstq+strideq*2]
+ VP9_STORE_2X 2, 3, 5, 6, 4
+ ZERO_BLOCK blockq, 8, 4, m4
+ RET
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+; 2x2 top left corner
+%macro VP9_IDCT4_2x2_1D 0
+ pmulhrsw m0, m5 ; m0=t1
+ mova m2, m0 ; m2=t0
+ mova m3, m1
+ pmulhrsw m1, m6 ; m1=t2
+ pmulhrsw m3, m7 ; m3=t3
+ VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT4_WRITEOUT 0
+%if cpuflag(ssse3)
+ mova m5, [pw_2048]
+ pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+ pmulhrsw m1, m5
+%else
+ mova m5, [pw_8]
+ paddw m0, m5
+ paddw m1, m5
+ psraw m0, 4
+ psraw m1, 4
+%endif
+ VP9_STORE_2X 0, 1, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+%if cpuflag(ssse3)
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+%else
+ paddw m2, m5
+ paddw m3, m5
+ psraw m2, 4
+ psraw m3, 4
+%endif
+ VP9_STORE_2X 2, 3, 6, 7, 4
+%endmacro
+
+%macro IDCT_4x4_FN 1
+INIT_MMX %1
+cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+ cmp eobd, 4 ; 2x2 or smaller
+ jg .idctfull
+
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idct2x2
+%else
+ cmp eobd, 1
+ jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ mova m5, [pw_11585x2]
+ pmulhrsw m0, m5
+ pmulhrsw m0, m5
+%else
+ DEFINE_ARGS dst, stride, block, coef
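+ ; scalar equivalent of the ssse3 path above: two rounded multiplies by
+ ; 11585/2^14, with the final (x+8)>>4 output rounding folded into the
+ ; second shift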
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (8 << 14) + 8192
+ sar coefd, 14 + 4
+ movd m0, coefd
+%endif
+ pshufw m0, m0, 0
+ pxor m4, m4
+ movh [blockq], m4
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ RET
+
+%if cpuflag(ssse3)
+; faster path for when only top left 2x2 block is set
+.idct2x2:
+ movd m0, [blockq+0]
+ movd m1, [blockq+8]
+ mova m5, [pw_11585x2]
+ mova m6, [pw_6270x2]
+ mova m7, [pw_15137x2]
+ VP9_IDCT4_2x2_1D
+ ; partial 2x4 transpose
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ SBUTTERFLY dq, 0, 2, 1
+ SWAP 1, 2
+ VP9_IDCT4_2x2_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ movh [blockq+ 0], m4
+ movh [blockq+ 8], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endif
+
+.idctfull: ; generic full 4x4 idct/idct
+ mova m0, [blockq+ 0]
+ mova m1, [blockq+ 8]
+ mova m2, [blockq+16]
+ mova m3, [blockq+24]
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+ mova m7, [pd_8192] ; rounding
+ VP9_IDCT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IDCT4_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ mova [blockq+ 0], m4
+ mova [blockq+ 8], m4
+ mova [blockq+16], m4
+ mova [blockq+24], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+IDCT_4x4_FN mmxext
+IDCT_4x4_FN ssse3
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro IADST4_FN 5
+INIT_MMX %5
+cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+%endif
+ movdqa xmm5, [pd_8192]
+ mova m0, [blockq+ 0]
+ mova m1, [blockq+ 8]
+ mova m2, [blockq+16]
+ mova m3, [blockq+24]
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+ movdq2q m7, xmm5
+%endif
+ VP9_%2_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_%4_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ mova [blockq+ 0], m4
+ mova [blockq+ 8], m4
+ mova [blockq+16], m4
+ mova [blockq+24], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+IADST4_FN idct, IDCT4, iadst, IADST4, sse2
+IADST4_FN iadst, IADST4, idct, IDCT4, sse2
+IADST4_FN iadst, IADST4, iadst, IADST4, sse2
+
+IADST4_FN idct, IDCT4, iadst, IADST4, ssse3
+IADST4_FN iadst, IADST4, idct, IDCT4, ssse3
+IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
+
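+; on x86-64, SCRATCH/UNSCRATCH just SWAP the value into/out of one of the high
+; xmm registers; on x86-32, which only has 8 xmm registers, they spill to /
+; reload from the given memory slot instead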
+%macro SCRATCH 3
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova [%3], m%1
+%endif
+%endmacro
+
+%macro UNSCRATCH 3
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT8_1D_FINALIZE 0
+ SUMSUB_BA w, 3, 6, 5 ; m3=t0+t7, m6=t0-t7
+ SUMSUB_BA w, 1, 2, 5 ; m1=t1+t6, m2=t1-t6
+ SUMSUB_BA w, 7, 0, 5 ; m7=t2+t5, m0=t2-t5
+
+ UNSCRATCH 5, 8, blockq+ 0
+ SCRATCH 2, 8, blockq+ 0
+
+ SUMSUB_BA w, 5, 4, 2 ; m5=t3+t4, m4=t3-t4
+ SWAP 7, 6, 2
+ SWAP 3, 5, 0
+
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+%endmacro
+
+; x86-32:
+; - in: m0/m4 are in mem
+; - out: m6 is in mem
+; x86-64:
+; - everything is in registers (m0-7)
+%macro VP9_IDCT8_1D 0
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 4, 9
+%endif
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 3, 9102, 13623, D_8192_REG, 0, 4 ; m5=t5a, m3=t6a
+ VP9_UNPACK_MULSUB_2W_4X 1, 7, 16069, 3196, D_8192_REG, 0, 4 ; m1=t4a, m7=t7a
+ SUMSUB_BA w, 5, 1, 0 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
+ SUMSUB_BA w, 3, 7, 0 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+%if cpuflag(ssse3)
+ SUMSUB_BA w, 1, 7, 0 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+ pmulhrsw m1, W_11585x2_REG ; m1=t6
+ pmulhrsw m7, W_11585x2_REG ; m7=t5
+%else
+ VP9_UNPACK_MULSUB_2W_4X 7, 1, 11585, 11585, D_8192_REG, 0, 4
+%endif
+ VP9_UNPACK_MULSUB_2W_4X 2, 6, 15137, 6270, D_8192_REG, 0, 4 ; m2=t2a, m6=t3a
+
+ UNSCRATCH 0, 8, blockq+ 0 ; IN(0)
+ UNSCRATCH 4, 9, blockq+64 ; IN(4)
+ SCRATCH 5, 8, blockq+ 0
+
+%if cpuflag(ssse3)
+ SUMSUB_BA w, 4, 0, 5 ; m4=IN(0)+IN(4) m0=IN(0)-IN(4)
+ pmulhrsw m4, W_11585x2_REG ; m4=t0a
+ pmulhrsw m0, W_11585x2_REG ; m0=t1a
+%else
+ SCRATCH 7, 9, blockq+64
+ VP9_UNPACK_MULSUB_2W_4X 0, 4, 11585, 11585, D_8192_REG, 5, 7
+ UNSCRATCH 7, 9, blockq+64
+%endif
+ SUMSUB_BA w, 6, 4, 5 ; m6=t0a+t3a (t0), m4=t0a-t3a (t3)
+ SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+
+ VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_4x4_1D 0
+ pmulhrsw m0, W_11585x2_REG ; m0=t1a/t0a
+ pmulhrsw m6, m2, [pw_15137x2] ; m6=t3a
+ pmulhrsw m2, [pw_6270x2] ; m2=t2a
+ pmulhrsw m7, m1, [pw_16069x2] ; m7=t7a
+ pmulhrsw m1, [pw_3196x2] ; m1=t4a
+ pmulhrsw m5, m3, [pw_m9102x2] ; m5=t5a
+ pmulhrsw m3, [pw_13623x2] ; m3=t6a
+ SUMSUB_BA w, 5, 1, 4 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
+ SUMSUB_BA w, 3, 7, 4 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+ SUMSUB_BA w, 1, 7, 4 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+ pmulhrsw m1, W_11585x2_REG ; m1=t6
+ pmulhrsw m7, W_11585x2_REG ; m7=t5
+ psubw m4, m0, m6 ; m4=t0a-t3a (t3)
+ paddw m6, m0 ; m6=t0a+t3a (t0)
+ SCRATCH 5, 8, blockq+ 0
+ SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+ VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_2x2_1D 1
+ pmulhrsw m0, W_11585x2_REG ; m0=t0
+ pmulhrsw m3, m1, W_16069x2_REG ; m3=t7
+ pmulhrsw m1, W_3196x2_REG ; m1=t4
+ psubw m7, m3, m1 ; t5 = t7a - t4a
+ paddw m5, m3, m1 ; t6 = t7a + t4a
+ pmulhrsw m7, W_11585x2_REG ; m7=t5
+ pmulhrsw m5, W_11585x2_REG ; m5=t6
+ SWAP 5, 1
+ ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
+ psubw m6, m0, m3 ; m6=t0-t7
+ paddw m3, m0 ; m3=t0+t7
+ psubw m2, m0, m1 ; m2=t1-t6
+ paddw m1, m0 ; m1=t1+t6
+%if %1 == 1
+ punpcklwd m3, m1
+%define SCRATCH_REG 1
+%elif ARCH_X86_32
+ mova [blockq+ 0], m2
+%define SCRATCH_REG 2
+%else
+%define SCRATCH_REG 8
+%endif
+ psubw m4, m0, m5 ; m4=t3-t4
+ paddw m5, m0 ; m5=t3+t4
+ SUMSUB_BA w, 7, 0, SCRATCH_REG ; m7=t2+t5, m0=t2-t5
+ SWAP 7, 6, 2
+ SWAP 3, 5, 0
+%undef SCRATCH_REG
+%endmacro
+
+%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift
+%if cpuflag(ssse3)
+ pmulhrsw m%1, %6 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+ pmulhrsw m%2, %6
+%else
+ paddw m%1, %6
+ paddw m%2, %6
+ psraw m%1, %7
+ psraw m%2, %7
+%endif
+%if %0 <= 7
+ VP9_STORE_2X %1, %2, %3, %4, %5
+%else
+ VP9_STORE_2X %1, %2, %3, %4, %5, %8
+%endif
+%endmacro
+
+; x86-32:
+; - m6 is in mem
+; x86-64:
+; - m8 holds m6 (SWAP)
+; - m6 holds zero
+%macro VP9_IDCT8_WRITEOUT 0
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+ mova m9, [pw_1024]
+%else
+ mova m9, [pw_16]
+%endif
+%define ROUND_REG m9
+%else
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_1024]
+%else
+%define ROUND_REG [pw_16]
+%endif
+%endif
+ SCRATCH 5, 10, blockq+16
+ SCRATCH 7, 11, blockq+32
+ VP9_IDCT8_WRITEx2 0, 1, 5, 7, 6, ROUND_REG
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 2, 3, 5, 7, 6, ROUND_REG
+ lea dstq, [dstq+2*strideq]
+ UNSCRATCH 5, 10, blockq+16
+ UNSCRATCH 7, 11, blockq+32
+ VP9_IDCT8_WRITEx2 4, 5, 0, 1, 6, ROUND_REG
+ lea dstq, [dstq+2*strideq]
+ UNSCRATCH 5, 8, blockq+ 0
+ VP9_IDCT8_WRITEx2 5, 7, 0, 1, 6, ROUND_REG
+
+%undef ROUND_REG
+%endmacro
+
+%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2
+INIT_XMM %1
+cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+ mova m12, [pw_11585x2] ; often used
+%define W_11585x2_REG m12
+%else
+%define W_11585x2_REG [pw_11585x2]
+%endif
+
+ cmp eobd, 12 ; top left half or less
+ jg .idctfull
+
+ cmp eobd, 3 ; top left corner or less
+ jg .idcthalf
+
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idcttopleftcorner
+%else
+ cmp eobd, 1
+ jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ pmulhrsw m0, W_11585x2_REG
+ pmulhrsw m0, W_11585x2_REG
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (16 << 14) + 8192
+ sar coefd, 14 + 5
+ movd m0, coefd
+%endif
+ SPLATW m0, m0, 0
+ pxor m4, m4
+ movd [blockq], m4
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_1024] ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+%endif
+%rep 3
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+%endrep
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ RET
+
+%if cpuflag(ssse3)
+; faster path for when only the top left corner is set (3 inputs: DC, the
+; coefficient right of DC and the one below it). Note: this also works for a
+; 2x2 block
+.idcttopleftcorner:
+ movd m0, [blockq+0]
+ movd m1, [blockq+16]
+%if ARCH_X86_64
+ mova m10, [pw_3196x2]
+ mova m11, [pw_16069x2]
+%define W_3196x2_REG m10
+%define W_16069x2_REG m11
+%else
+%define W_3196x2_REG [pw_3196x2]
+%define W_16069x2_REG [pw_16069x2]
+%endif
+ VP9_IDCT8_2x2_1D 1
+ ; partial 2x8 transpose
+ ; punpcklwd m0, m1 already done inside idct
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ punpckldq m0, m2
+ punpckldq m4, m6
+ SBUTTERFLY qdq, 0, 4, 1
+ SWAP 1, 4
+ VP9_IDCT8_2x2_1D 2
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
+ VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+ movd [blockq+ 0], m6
+ movd [blockq+16], m6
+%else
+ mova [blockq+ 0], m6
+ mova [blockq+16], m6
+ mova [blockq+32], m6
+%endif
+ RET
+
+.idcthalf:
+ movh m0, [blockq + 0]
+ movh m1, [blockq +16]
+ movh m2, [blockq +32]
+ movh m3, [blockq +48]
+ VP9_IDCT8_4x4_1D
+ ; partial 4x8 transpose
+%if ARCH_X86_32
+ mova m6, [blockq+ 0]
+%endif
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ SBUTTERFLY dq, 0, 2, 1
+ SBUTTERFLY dq, 4, 6, 5
+ SBUTTERFLY qdq, 0, 4, 1
+ SBUTTERFLY qdq, 2, 6, 5
+ SWAP 1, 4
+ SWAP 3, 6
+ VP9_IDCT8_4x4_1D
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6
+ VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+ movh [blockq+ 0], m6
+ movh [blockq+16], m6
+ movh [blockq+32], m6
+%else
+ mova [blockq+ 0], m6
+ mova [blockq+16], m6
+ mova [blockq+32], m6
+%endif
+ movh [blockq+48], m6
+ RET
+%endif
+
+.idctfull: ; generic full 8x8 idct/idct
+%if ARCH_X86_64
+ mova m0, [blockq+ 0] ; IN(0)
+%endif
+ mova m1, [blockq+ 16] ; IN(1)
+ mova m2, [blockq+ 32] ; IN(2)
+ mova m3, [blockq+ 48] ; IN(3)
+%if ARCH_X86_64
+ mova m4, [blockq+ 64] ; IN(4)
+%endif
+ mova m5, [blockq+ 80] ; IN(5)
+ mova m6, [blockq+ 96] ; IN(6)
+ mova m7, [blockq+112] ; IN(7)
+%if ARCH_X86_64
+ mova m11, [pd_8192] ; rounding
+%define D_8192_REG m11
+%else
+%define D_8192_REG [pd_8192]
+%endif
+ VP9_IDCT8_1D
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+ mova [blockq+0], m0
+%endif
+ VP9_IDCT8_1D
+
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
+ VP9_IDCT8_WRITEOUT
+ ZERO_BLOCK blockq, 16, 8, m6
+ RET
+%undef W_11585x2_REG
+%endmacro
+
+VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
+VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
+VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-32:
+; - in: m0/3/4/7 are in mem [blockq+N*16]
+; - out: m6 is in mem [blockq+0]
+; x86-64:
+; - everything is in registers
+%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 3, 9
+ SWAP 4, 10
+ SWAP 7, 11
+%endif
+
+ VP9_UNPACK_MULSUB_2D_4X 5, 2, 0, 3, 14449, 7723 ; m5/2=t3[d], m2/4=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 1, 6, 4, 7, 4756, 15679 ; m1/4=t7[d], m6/7=t6[d]
+ SCRATCH 4, 12, blockq+1*16
+ VP9_RND_SH_SUMSUB_BA 6, 2, 7, 3, 4, D_8192_REG ; m6=t2[w], m2=t6[w]
+ UNSCRATCH 4, 12, blockq+1*16
+ VP9_RND_SH_SUMSUB_BA 1, 5, 4, 0, 3, D_8192_REG ; m1=t3[w], m5=t7[w]
+
+ UNSCRATCH 0, 8, blockq+16*0
+ UNSCRATCH 3, 9, blockq+16*3
+ UNSCRATCH 4, 10, blockq+16*4
+ UNSCRATCH 7, 11, blockq+16*7
+ SCRATCH 1, 8, blockq+16*1
+ SCRATCH 2, 9, blockq+16*2
+ SCRATCH 5, 10, blockq+16*5
+ SCRATCH 6, 11, blockq+16*6
+
+ VP9_UNPACK_MULSUB_2D_4X 7, 0, 1, 2, 16305, 1606 ; m7/1=t1[d], m0/2=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 4, 5, 6, 10394, 12665 ; m3/5=t5[d], m4/6=t4[d]
+ SCRATCH 1, 12, blockq+ 0*16
+ VP9_RND_SH_SUMSUB_BA 4, 0, 6, 2, 1, D_8192_REG ; m4=t0[w], m0=t4[w]
+ UNSCRATCH 1, 12, blockq+ 0*16
+ VP9_RND_SH_SUMSUB_BA 3, 7, 5, 1, 2, D_8192_REG ; m3=t1[w], m7=t5[w]
+
+ UNSCRATCH 2, 9, blockq+16*2
+ UNSCRATCH 5, 10, blockq+16*5
+ SCRATCH 3, 9, blockq+16*3
+ SCRATCH 4, 10, blockq+16*4
+
+ ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7
+
+ VP9_UNPACK_MULSUB_2D_4X 0, 7, 1, 3, 15137, 6270 ; m0/1=t5[d], m7/3=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 2, 4, 6, 6270, 15137 ; m5/4=t6[d], m2/6=t7[d]
+ SCRATCH 1, 12, blockq+ 0*16
+ VP9_RND_SH_SUMSUB_BA 5, 7, 4, 3, 1, D_8192_REG
+ UNSCRATCH 1, 12, blockq+ 0*16
+ PSIGNW m5, W_M1_REG ; m5=out1[w], m7=t6[w]
+ VP9_RND_SH_SUMSUB_BA 2, 0, 6, 1, 3, D_8192_REG ; m2=out6[w], m0=t7[w]
+
+ UNSCRATCH 1, 8, blockq+16*1
+ UNSCRATCH 3, 9, blockq+16*3
+ UNSCRATCH 4, 10, blockq+16*4
+ UNSCRATCH 6, 11, blockq+16*6
+ SCRATCH 2, 8, blockq+16*0
+
+ SUMSUB_BA w, 6, 4, 2 ; m6=out0[w], m4=t2[w]
+ SUMSUB_BA w, 1, 3, 2
+ PSIGNW m1, W_M1_REG ; m1=out7[w], m3=t3[w]
+
+ ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7
+
+ ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+ SUMSUB_BA w, 3, 4, 2
+ SUMSUB_BA w, 0, 7, 2
+ pmulhrsw m3, W_11585x2_REG
+ pmulhrsw m7, W_11585x2_REG
+ pmulhrsw m4, W_11585x2_REG ; out4
+ pmulhrsw m0, W_11585x2_REG ; out2
+%else
+ SCRATCH 5, 9, blockq+16*1
+ VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, D_8192_REG, 2, 5
+ VP9_UNPACK_MULSUB_2W_4X 7, 0, 11585, 11585, D_8192_REG, 2, 5
+ UNSCRATCH 5, 9, blockq+16*1
+%endif
+ PSIGNW m3, W_M1_REG ; out3
+ PSIGNW m7, W_M1_REG ; out5
+
+ ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7
+
+%if ARCH_X86_64
+ SWAP 2, 8
+%endif
+ SWAP 0, 6, 2
+ SWAP 7, 1, 5
+%endmacro
+
+%macro IADST8_FN 6
+INIT_XMM %5
+cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob
+
+%ifidn %1, idct
+%define first_is_idct 1
+%else
+%define first_is_idct 0
+%endif
+
+%ifidn %3, idct
+%define second_is_idct 1
+%else
+%define second_is_idct 0
+%endif
+
+%if ARCH_X86_64
+ mova m0, [blockq+ 0] ; IN(0)
+%endif
+ mova m1, [blockq+ 16] ; IN(1)
+ mova m2, [blockq+ 32] ; IN(2)
+%if ARCH_X86_64 || first_is_idct
+ mova m3, [blockq+ 48] ; IN(3)
+%endif
+%if ARCH_X86_64
+ mova m4, [blockq+ 64] ; IN(4)
+%endif
+ mova m5, [blockq+ 80] ; IN(5)
+ mova m6, [blockq+ 96] ; IN(6)
+%if ARCH_X86_64 || first_is_idct
+ mova m7, [blockq+112] ; IN(7)
+%endif
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+ mova m15, [pw_11585x2] ; often used
+%endif
+ mova m13, [pd_8192] ; rounding
+ mova m14, [pw_m1]
+%define W_11585x2_REG m15
+%define D_8192_REG m13
+%define W_M1_REG m14
+%else
+%define W_11585x2_REG [pw_11585x2]
+%define D_8192_REG [pd_8192]
+%define W_M1_REG [pw_m1]
+%endif
+
+ ; note different calling conventions for idct8 vs. iadst8 on x86-32
+ VP9_%2_1D
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+ mova [blockq+ 0], m0
+%if second_is_idct == 0
+ mova [blockq+ 48], m3
+ mova [blockq+112], m7
+%endif
+%endif
+ VP9_%4_1D
+
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
+ VP9_IDCT8_WRITEOUT
+ ZERO_BLOCK blockq, 16, 8, m6
+ RET
+
+%undef W_11585x2_REG
+%undef first_is_idct
+%undef second_is_idct
+
+%endmacro
+
+IADST8_FN idct, IDCT8, iadst, IADST8, sse2, 15
+IADST8_FN iadst, IADST8, idct, IDCT8, sse2, 15
+IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
+IADST8_FN idct, IDCT8, iadst, IADST8, ssse3, 16
+IADST8_FN idct, IDCT8, iadst, IADST8, avx, 16
+IADST8_FN iadst, IADST8, idct, IDCT8, ssse3, 16
+IADST8_FN iadst, IADST8, idct, IDCT8, avx, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-64:
+; at the end of this macro, m7 is stored in [%4+15*%5]
+; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
+; the following sumsubs have not been done yet:
+; SUMSUB_BA w, 6, 9, 15 ; t6, t9
+; SUMSUB_BA w, 7, 8, 15 ; t7, t8
+; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
+; and the following sumsubs have not been done yet:
+; SUMSUB_BA w, x13, x14, 7 ; t6, t9
+; SUMSUB_BA w, x15, x12, 7 ; t7, t8
+
+%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
+%if %2 <= 4
+ mova m3, [%1+ 1*%3] ; IN(1)
+ mova m0, [%1+ 3*%3] ; IN(3)
+
+ pmulhrsw m4, m3, [pw_16305x2] ; t14-15
+ pmulhrsw m3, [pw_1606x2] ; t8-9
+ pmulhrsw m7, m0, [pw_m4756x2] ; t10-11
+ pmulhrsw m0, [pw_15679x2] ; t12-13
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+ ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 1, 6 ; t9, t14
+ SCRATCH 4, 10, %4+ 1*%5
+ SCRATCH 5, 11, %4+ 7*%5
+ VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+ UNSCRATCH 5, 11, %4+ 7*%5
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+%else
+ mova m5, [%1+ 1*%3] ; IN(1)
+ mova m4, [%1+ 7*%3] ; IN(7)
+%if %2 <= 8
+ pmulhrsw m2, m5, [pw_16305x2] ; t15
+ pmulhrsw m5, [pw_1606x2] ; t8
+ pmulhrsw m3, m4, [pw_m10394x2] ; t9
+ pmulhrsw m4, [pw_12665x2] ; t14
+%else
+ mova m3, [%1+ 9*%3] ; IN(9)
+ mova m2, [%1+15*%3] ; IN(15)
+
+ ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
+ ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 0, 1 ; t8, t15
+ VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 0, 1 ; t9, t14
+%endif
+
+ SUMSUB_BA w, 3, 5, 0 ; t8, t9
+ SUMSUB_BA w, 4, 2, 0 ; t15, t14
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 0, 1 ; t9, t14
+
+ SCRATCH 4, 10, %4+ 1*%5
+ SCRATCH 5, 11, %4+ 7*%5
+
+ mova m6, [%1+ 3*%3] ; IN(3)
+ mova m7, [%1+ 5*%3] ; IN(5)
+%if %2 <= 8
+ pmulhrsw m0, m7, [pw_14449x2] ; t13
+ pmulhrsw m7, [pw_7723x2] ; t10
+ pmulhrsw m1, m6, [pw_m4756x2] ; t11
+ pmulhrsw m6, [pw_15679x2] ; t12
+%else
+ mova m0, [%1+11*%3] ; IN(11)
+ mova m1, [%1+13*%3] ; IN(13)
+
+ VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 4, 5 ; t10, t13
+ VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 4, 5 ; t11, t12
+%endif
+
+ ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
+ ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
+
+ SUMSUB_BA w, 7, 1, 4 ; t11, t10
+ SUMSUB_BA w, 0, 6, 4 ; t12, t13
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+ ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+ VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+
+ UNSCRATCH 5, 11, %4+ 7*%5
+%endif
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
+ ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
+
+ SUMSUB_BA w, 7, 3, 4 ; t8, t11
+
+ ; backup first register
+ mova [%4+15*%5], m7
+
+ SUMSUB_BA w, 6, 2, 7 ; t9, t10
+ UNSCRATCH 4, 10, %4+ 1*%5
+ SUMSUB_BA w, 0, 4, 7 ; t15, t12
+ SUMSUB_BA w, 1, 5, 7 ; t14, t13
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+
+%if cpuflag(ssse3) && %6 == 0
+ SUMSUB_BA w, 2, 5, 7
+ SUMSUB_BA w, 3, 4, 7
+ pmulhrsw m5, [pw_11585x2] ; t10
+ pmulhrsw m4, [pw_11585x2] ; t11
+ pmulhrsw m3, [pw_11585x2] ; t12
+ pmulhrsw m2, [pw_11585x2] ; t13
+%else
+ SCRATCH 6, 10, %4+ 1*%5
+ VP9_UNPACK_MULSUB_2W_4X 5, 2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
+ VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
+ UNSCRATCH 6, 10, %4+ 1*%5
+%endif
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
+
+ SCRATCH 0, 8, %4+ 1*%5
+ SCRATCH 1, 9, %4+ 3*%5
+ SCRATCH 2, 10, %4+ 5*%5
+ SCRATCH 3, 11, %4+ 7*%5
+ SCRATCH 4, 12, %4+ 9*%5
+ SCRATCH 5, 13, %4+11*%5
+ SCRATCH 6, 14, %4+13*%5
+
+ ; even (tx8x8)
+%if %2 <= 4
+ mova m3, [%1+ 0*%3] ; IN(0)
+ mova m4, [%1+ 2*%3] ; IN(2)
+
+ pmulhrsw m3, [pw_11585x2] ; t0-t3
+ pmulhrsw m7, m4, [pw_16069x2] ; t6-7
+ pmulhrsw m4, [pw_3196x2] ; t4-5
+
+%if 0 ; overflows :(
+ paddw m6, m7, m4
+ psubw m5, m7, m4
+ pmulhrsw m5, [pw_11585x2] ; t5
+ pmulhrsw m6, [pw_11585x2] ; t6
+%else
+ VP9_UNPACK_MULSUB_2W_4X 5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5, t6
+%endif
+
+ psubw m0, m3, m7
+ paddw m7, m3
+ psubw m1, m3, m6
+ paddw m6, m3
+ psubw m2, m3, m5
+ paddw m5, m3
+
+%if ARCH_X86_32
+ SWAP 0, 7
+%endif
+ SCRATCH 7, 15, %4+12*%5
+%else
+ mova m6, [%1+ 2*%3] ; IN(2)
+ mova m1, [%1+ 4*%3] ; IN(4)
+ mova m7, [%1+ 6*%3] ; IN(6)
+%if %2 <= 8
+ pmulhrsw m0, m1, [pw_15137x2] ; t3
+ pmulhrsw m1, [pw_6270x2] ; t2
+ pmulhrsw m5, m6, [pw_16069x2] ; t7
+ pmulhrsw m6, [pw_3196x2] ; t4
+ pmulhrsw m4, m7, [pw_m9102x2] ; t5
+ pmulhrsw m7, [pw_13623x2] ; t6
+%else
+ mova m4, [%1+10*%3] ; IN(10)
+ mova m0, [%1+12*%3] ; IN(12)
+ mova m5, [%1+14*%3] ; IN(14)
+
+ VP9_UNPACK_MULSUB_2W_4X 1, 0, 15137, 6270, [pd_8192], 2, 3 ; t2, t3
+ VP9_UNPACK_MULSUB_2W_4X 6, 5, 16069, 3196, [pd_8192], 2, 3 ; t4, t7
+ VP9_UNPACK_MULSUB_2W_4X 4, 7, 9102, 13623, [pd_8192], 2, 3 ; t5, t6
+%endif
+
+ SUMSUB_BA w, 4, 6, 2 ; t4, t5
+ SUMSUB_BA w, 7, 5, 2 ; t7, t6
+
+%if cpuflag(ssse3) && %6 == 0
+ SUMSUB_BA w, 6, 5, 2
+ pmulhrsw m5, [pw_11585x2] ; t5
+ pmulhrsw m6, [pw_11585x2] ; t6
+%else
+ VP9_UNPACK_MULSUB_2W_4X 5, 6, 11585, 11585, [pd_8192], 2, 3 ; t5, t6
+%endif
+
+ SCRATCH 5, 15, %4+10*%5
+ mova m2, [%1+ 0*%3] ; IN(0)
+%if %2 <= 8
+ pmulhrsw m2, [pw_11585x2] ; t0 and t1
+ psubw m3, m2, m0
+ paddw m0, m2
+
+ SUMSUB_BA w, 7, 0, 5 ; t0, t7
+%else
+ mova m3, [%1+ 8*%3] ; IN(8)
+
+ ; from 3 stages back
+%if cpuflag(ssse3) && %6 == 0
+ SUMSUB_BA w, 3, 2, 5
+ pmulhrsw m3, [pw_11585x2] ; t0
+ pmulhrsw m2, [pw_11585x2] ; t1
+%else
+ mova [%1+ 0*%3], m0
+ VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 0 ; t0, t1
+ mova m0, [%1+ 0*%3]
+%endif
+
+ ; from 2 stages back
+ SUMSUB_BA w, 0, 3, 5 ; t0, t3
+
+ SUMSUB_BA w, 7, 0, 5 ; t0, t7
+%endif
+ UNSCRATCH 5, 15, %4+10*%5
+%if ARCH_X86_32
+ SWAP 0, 7
+%endif
+ SCRATCH 7, 15, %4+12*%5
+ SUMSUB_BA w, 1, 2, 7 ; t1, t2
+
+ ; from 1 stage back
+ SUMSUB_BA w, 6, 1, 7 ; t1, t6
+ SUMSUB_BA w, 5, 2, 7 ; t2, t5
+%endif
+ SUMSUB_BA w, 4, 3, 7 ; t3, t4
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+ SWAP 5, 13
+ SWAP 6, 14
+
+ SUMSUB_BA w, 0, 15, 7 ; t0, t15
+ SUMSUB_BA w, 1, 14, 7 ; t1, t14
+ SUMSUB_BA w, 2, 13, 7 ; t2, t13
+ SUMSUB_BA w, 3, 12, 7 ; t3, t12
+ SUMSUB_BA w, 4, 11, 7 ; t4, t11
+ SUMSUB_BA w, 5, 10, 7 ; t5, t10
+%else
+ SWAP 1, 6
+ SWAP 2, 5
+ SWAP 3, 4
+ mova [%4+14*%5], m6
+
+%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
+ mova m6, [%4+%2*%5]
+ SUMSUB_BA w, 6, %1, 7
+ SWAP %1, 6
+ mova [%4+%3*%5], m6
+%endmacro
+
+ %%SUMSUB_BA_STORE 0, 1, 1, %4, %5 ; t0, t15
+ %%SUMSUB_BA_STORE 1, 3, 3, %4, %5 ; t1, t14
+ %%SUMSUB_BA_STORE 2, 5, 5, %4, %5 ; t2, t13
+ %%SUMSUB_BA_STORE 3, 7, 7, %4, %5 ; t3, t12
+ %%SUMSUB_BA_STORE 4, 9, 9, %4, %5 ; t4, t11
+ %%SUMSUB_BA_STORE 5, 11, 11, %4, %5 ; t5, t10
+%endif
+%endmacro
+
+%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
+%if %2 == 1
+ VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4
+
+%if ARCH_X86_64
+ ; backup a different register
+ mova m7, [tmpq+15*16]
+ mova [tmpq+ 1*16], m15
+
+ SUMSUB_BA w, 6, 9, 15 ; t6, t9
+ SUMSUB_BA w, 7, 8, 15 ; t7, t8
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15
+ mova [tmpq+ 0], m0
+ mova [tmpq+ 32], m1
+ mova [tmpq+ 64], m2
+ mova [tmpq+ 96], m3
+ mova [tmpq+128], m4
+ mova [tmpq+160], m5
+ mova [tmpq+192], m6
+ mova [tmpq+224], m7
+
+ mova m15, [tmpq+ 1*16]
+ TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
+ mova [tmpq+ 16], m8
+ mova [tmpq+ 48], m9
+ mova [tmpq+ 80], m10
+ mova [tmpq+112], m11
+ mova [tmpq+144], m12
+ mova [tmpq+176], m13
+ mova [tmpq+208], m14
+ mova [tmpq+240], m15
+%else
+ mova m6, [tmpq+13*16]
+ mova m7, [tmpq+14*16]
+ SUMSUB_BA w, 6, 7 ; t6, t9
+ mova [tmpq+14*16], m6
+ mova [tmpq+13*16], m7
+ mova m7, [tmpq+15*16]
+ mova m6, [tmpq+12*16]
+ SUMSUB_BA w, 7, 6 ; t7, t8
+ mova [tmpq+15*16], m6
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
+ mova [tmpq+ 0*16], m0
+ mova [tmpq+ 2*16], m1
+ mova [tmpq+ 4*16], m2
+ mova [tmpq+ 6*16], m3
+ mova [tmpq+10*16], m5
+ mova [tmpq+12*16], m6
+ mova [tmpq+14*16], m7
+
+ mova m0, [tmpq+15*16]
+ mova m1, [tmpq+13*16]
+ mova m2, [tmpq+11*16]
+ mova m3, [tmpq+ 9*16]
+ mova m4, [tmpq+ 7*16]
+ mova m5, [tmpq+ 5*16]
+ mova m7, [tmpq+ 1*16]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
+ mova [tmpq+ 1*16], m0
+ mova [tmpq+ 3*16], m1
+ mova [tmpq+ 5*16], m2
+ mova [tmpq+ 7*16], m3
+ mova [tmpq+11*16], m5
+ mova [tmpq+13*16], m6
+ mova [tmpq+15*16], m7
+%endif
+%else ; %2 == 2
+ VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+ pxor m7, m7
+%if ARCH_X86_64
+ ; backup more registers
+ mova [%1+ 2*32], m8
+ mova [%1+ 3*32], m9
+
+ VP9_IDCT8_WRITEx2 0, 1, 8, 9, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 2, 3, 8, 9, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 4, 5, 8, 9, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ ; restore from cache
+ SWAP 0, 7 ; move zero from m7 to m0
+ mova m7, [%1+15*32]
+ mova m8, [%1+ 2*32]
+ mova m9, [%1+ 3*32]
+
+ SUMSUB_BA w, 6, 9, 3 ; t6, t9
+ SUMSUB_BA w, 7, 8, 3 ; t7, t8
+
+ VP9_IDCT8_WRITEx2 6, 7, 3, 4, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 8, 9, 3, 4, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, 0, ROUND_REG, 6
+%else
+ mova [tmpq+ 0*32], m5
+
+ VP9_IDCT8_WRITEx2 0, 1, 5, 6, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 2, 3, 5, 6, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ SWAP 0, 7 ; move zero from m7 to m0
+ mova m5, [tmpq+ 0*32]
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m4, [tmpq+13*32]
+ mova m7, [tmpq+14*32]
+ mova m5, [tmpq+15*32]
+ mova m6, [tmpq+12*32]
+ SUMSUB_BADC w, 4, 7, 5, 6, 1
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m4, [tmpq+11*32]
+ mova m5, [tmpq+ 9*32]
+ mova m6, [tmpq+ 7*32]
+ mova m7, [tmpq+ 5*32]
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m4, [tmpq+ 3*32]
+ mova m5, [tmpq+ 1*32]
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+%endif
+
+%undef ROUND_REG
+%endif ; %2 == 1/2
+%endmacro
+
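+; like VP9_STORE_2X, but adds the (splatted) dc value to a full 16-pixel row
+; of each of the two destination lines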
+%macro VP9_STORE_2XFULL 6-7 strideq ; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
+ mova m%3, [dstq]
+ mova m%5, [dstq+%7]
+ punpcklbw m%2, m%3, m%6
+ punpckhbw m%3, m%6
+ punpcklbw m%4, m%5, m%6
+ punpckhbw m%5, m%6
+ paddw m%2, m%1
+ paddw m%3, m%1
+ paddw m%4, m%1
+ paddw m%5, m%1
+ packuswb m%2, m%3
+ packuswb m%4, m%5
+ mova [dstq], m%2
+ mova [dstq+%7], m%4
+%endmacro
+
+%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
+%if cpuflag(ssse3)
+ ; 2x2=eob=3, 4x4=eob=10
+ cmp eobd, 38
+ jg .idctfull
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idct8x8
+%else
+ cmp eobd, 1 ; faster path for when only DC is set
+ jg .idctfull
+%endif
+
+ ; dc-only
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ mova m1, [pw_11585x2]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (32 << 14) + 8192
+ sar coefd, 14 + 6
+ movd m0, coefd
+%endif
+ SPLATW m0, m0, q0000
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_512]
+%endif
+ pxor m5, m5
+ movd [blockq], m5
+%rep 7
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
+ lea dstq, [dstq+2*strideq]
+%endrep
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
+ RET
+
+ DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
+%if cpuflag(ssse3)
+.idct8x8:
+ mov tmpq, rsp
+ VP9_IDCT16_1D blockq, 1, 8, 0
+
+ mov cntd, 2
+ mov dst_bakq, dstq
+.loop2_8x8:
+ VP9_IDCT16_1D tmpq, 2, 8, 0
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_8x8
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 8, m0
+ RET
+%endif
+
+.idctfull:
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT16_1D blockq, 1, 16, 0
+ add blockq, 16
+ add tmpq, 256
+ dec cntd
+ jg .loop1_full
+ sub blockq, 32
+
+ mov cntd, 2
+ mov tmpq, rsp
+ mov dst_bakq, dstq
+.loop2_full:
+ VP9_IDCT16_1D tmpq, 2, 16, 0
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+VP9_IDCT_IDCT_16x16_ADD_XMM sse2
+VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
+VP9_IDCT_IDCT_16x16_ADD_XMM avx
+
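+; single 1D pass of the 16x16 idct on 16 ymm rows; m0 and m4 are expected in
+; [blockq+0] and [blockq+128] rather than in registers, and on exit one row
+; (m6 after the final SWAPs) is left in [blockq+192], mirroring the convention
+; documented for VP9_IADST16_YMM_1D below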
+%macro VP9_IDCT16_YMM_1D 0
+ VP9_UNPACK_MULSUB_2W_4X 1, 15, 16305, 1606, [pd_8192], 0, 4 ; t8, t15
+ VP9_UNPACK_MULSUB_2W_4X 9, 7, 10394, 12665, [pd_8192], 0, 4 ; t9, t14
+
+ SUMSUB_BA w, 9, 1, 0 ; t8, t9
+ SUMSUB_BA w, 7, 15, 0 ; t15, t14
+
+ VP9_UNPACK_MULSUB_2W_4X 15, 1, 15137, 6270, [pd_8192], 0, 4 ; t9, t14
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 11, 14449, 7723, [pd_8192], 0, 4 ; t10, t13
+ VP9_UNPACK_MULSUB_2W_4X 13, 3, 4756, 15679, [pd_8192], 0, 4 ; t11, t12
+
+ SUMSUB_BA w, 5, 13, 0 ; t11, t10
+ SUMSUB_BA w, 11, 3, 0 ; t12, t13
+
+ VP9_UNPACK_MULSUB_2W_4X 3, 13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13
+
+ SUMSUB_BA w, 5, 9, 0 ; t8, t11
+ SUMSUB_BA w, 3, 15, 0 ; t9, t10
+ SUMSUB_BA w, 11, 7, 0 ; t15, t12
+ SUMSUB_BA w, 13, 1, 0 ; t14, t13
+
+ SUMSUB_BA w, 15, 1, 0
+ SUMSUB_BA w, 9, 7, 0
+ pmulhrsw m1, [pw_11585x2] ; t10
+ pmulhrsw m7, [pw_11585x2] ; t11
+ pmulhrsw m9, [pw_11585x2] ; t12
+ pmulhrsw m15, [pw_11585x2] ; t13
+
+ ; even (tx8x8)
+ mova m4, [blockq+128]
+ mova [blockq+128], m5
+ VP9_UNPACK_MULSUB_2W_4X 4, 12, 15137, 6270, [pd_8192], 0, 5 ; t2, t3
+ VP9_UNPACK_MULSUB_2W_4X 2, 14, 16069, 3196, [pd_8192], 0, 5 ; t4, t7
+ VP9_UNPACK_MULSUB_2W_4X 10, 6, 9102, 13623, [pd_8192], 0, 5 ; t5, t6
+ mova m0, [blockq+ 0]
+ SUMSUB_BA w, 8, 0, 5
+ pmulhrsw m8, [pw_11585x2] ; t0
+ pmulhrsw m0, [pw_11585x2] ; t1
+
+ SUMSUB_BA w, 10, 2, 5 ; t4, t5
+ SUMSUB_BA w, 6, 14, 5 ; t7, t6
+ SUMSUB_BA w, 12, 8, 5 ; t0, t3
+ SUMSUB_BA w, 4, 0, 5 ; t1, t2
+
+ SUMSUB_BA w, 2, 14, 5
+ pmulhrsw m14, [pw_11585x2] ; t5
+ pmulhrsw m2, [pw_11585x2] ; t6
+
+ SUMSUB_BA w, 6, 12, 5 ; t0, t7
+ SUMSUB_BA w, 2, 4, 5 ; t1, t6
+ SUMSUB_BA w, 14, 0, 5 ; t2, t5
+ SUMSUB_BA w, 10, 8, 5 ; t3, t4
+
+ ; final stage
+ SUMSUB_BA w, 11, 6, 5 ; out0, out15
+ SUMSUB_BA w, 13, 2, 5 ; out1, out14
+ SUMSUB_BA w, 15, 14, 5 ; out2, out13
+ SUMSUB_BA w, 9, 10, 5 ; out3, out12
+ SUMSUB_BA w, 7, 8, 5 ; out4, out11
+ SUMSUB_BA w, 1, 0, 5 ; out5, out10
+ SUMSUB_BA w, 3, 4, 5 ; out6, out9
+ mova m5, [blockq+128]
+ mova [blockq+192], m3
+ SUMSUB_BA w, 5, 12, 3 ; out7, out8
+
+ SWAP 0, 11, 8, 12, 10
+ SWAP 1, 13, 14, 2, 15, 6, 3, 9, 4, 7, 5
+%endmacro
+
+; this is almost identical to VP9_STORE_2X, but it does two rows
+; for slightly improved interleaving, and it omits vpermq since the
+; input is DC so all values are identical
+%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
+ mova xm%2, [dstq]
+ mova xm%4, [dstq+strideq*2]
+ vinserti128 m%2, m%2, [dstq+strideq], 1
+ vinserti128 m%4, m%4, [dstq+stride3q], 1
+ punpckhbw m%3, m%2, m%6
+ punpcklbw m%2, m%6
+ punpckhbw m%5, m%4, m%6
+ punpcklbw m%4, m%6
+ paddw m%3, m%1
+ paddw m%2, m%1
+ paddw m%5, m%1
+ paddw m%4, m%1
+ packuswb m%2, m%3
+ packuswb m%4, m%5
+ mova [dstq], xm%2
+ mova [dstq+strideq*2], xm%4
+ vextracti128 [dstq+strideq], m%2, 1
+ vextracti128 [dstq+stride3q], m%4, 1
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob
+ cmp eobd, 1 ; faster path for when only DC is set
+ jg .idctfull
+
+ ; dc-only
+ mova m1, [pw_11585x2]
+ vpbroadcastw m0, [blockq]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ pxor m5, m5
+ pmulhrsw m0, [pw_512]
+ movd [blockq], xm5
+
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 4
+ lea stride3q, [strideq*3]
+.loop_dc:
+ VP9_STORE_YMM_DC_4X 0, 1, 2, 3, 4, 5
+ lea dstq, [dstq+4*strideq]
+ dec cntd
+ jg .loop_dc
+ RET
+
+ DEFINE_ARGS dst, stride, block, eob
+.idctfull:
+ mova m1, [blockq+ 32]
+ mova m2, [blockq+ 64]
+ mova m3, [blockq+ 96]
+ mova m5, [blockq+160]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ mova m8, [blockq+256]
+ mova m9, [blockq+288]
+ mova m10, [blockq+320]
+ mova m11, [blockq+352]
+ mova m12, [blockq+384]
+ mova m13, [blockq+416]
+ mova m14, [blockq+448]
+ mova m15, [blockq+480]
+
+ VP9_IDCT16_YMM_1D
+ TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ [blockq+192], [blockq+128], 1
+ mova [blockq+ 0], m0
+ VP9_IDCT16_YMM_1D
+
+ mova [blockq+224], m7
+
+ ; store
+ VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ pxor m0, m0
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endif
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IADST16_1D 2 ; src, pass
+%assign %%str 16*%2
+ mova m0, [%1+ 0*32] ; in0
+ mova m1, [%1+15*32] ; in15
+ mova m2, [%1+ 7*32] ; in7
+ mova m3, [%1+ 8*32] ; in8
+
+ VP9_UNPACK_MULSUB_2D_4X 1, 0, 4, 5, 16364, 804 ; m1/4=t1[d], m0/5=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 6, 11003, 12140 ; m2/7=t9[d], m3/6=t8[d]
+ SCRATCH 4, 8, tmpq+ 0*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 0, 6, 5, 4, [pd_8192] ; m3=t0[w], m0=t8[w]
+ UNSCRATCH 4, 8, tmpq+ 0*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 1, 7, 4, 5, [pd_8192] ; m2=t1[w], m1=t9[w]
+
+ SCRATCH 0, 10, tmpq+ 0*%%str
+ SCRATCH 1, 11, tmpq+15*%%str
+ mova [tmpq+ 7*%%str], m2
+ mova [tmpq+ 8*%%str], m3
+
+ mova m1, [%1+ 2*32] ; in2
+ mova m0, [%1+13*32] ; in13
+ mova m3, [%1+ 5*32] ; in5
+ mova m2, [%1+10*32] ; in10
+
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 15893, 3981 ; m0/6=t3[d], m1/7=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d]
+ SCRATCH 4, 12, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 1, 5, 7, 4, [pd_8192] ; m2=t2[w], m1=t10[w]
+ UNSCRATCH 4, 12, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 0, 4, 6, 5, [pd_8192] ; m3=t3[w], m0=t11[w]
+
+ SCRATCH 0, 12, tmpq+ 2*%%str
+ SCRATCH 1, 13, tmpq+13*%%str
+ mova [tmpq+ 5*%%str], m2
+ mova [tmpq+10*%%str], m3
+
+ mova m2, [%1+ 4*32] ; in4
+ mova m3, [%1+11*32] ; in11
+ mova m0, [%1+ 3*32] ; in3
+ mova m1, [%1+12*32] ; in12
+
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 14811, 7005 ; m3/7=t5[d], m2/6=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 5520, 15426 ; m0/4=t13[d], m1/5=t12[d]
+ SCRATCH 4, 9, tmpq+ 4*%%str
+ VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t4[w], m2=t12[w]
+ UNSCRATCH 4, 9, tmpq+ 4*%%str
+ VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t5[w], m3=t13[w]
+
+ SCRATCH 0, 8, tmpq+ 4*%%str
+ mova [tmpq+11*%%str], m1 ; t4:m1->r11
+ UNSCRATCH 0, 10, tmpq+ 0*%%str
+ UNSCRATCH 1, 11, tmpq+15*%%str
+
+ ; round 2 interleaved part 1
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 5, 4, 3196, 16069 ; m3/5=t12[d], m2/4=t13[d]
+ SCRATCH 4, 9, tmpq+ 3*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 1, 5, 7, 4, [pd_8192] ; m3=t8[w], m1=t12[w]
+ UNSCRATCH 4, 9, tmpq+ 3*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 0, 4, 6, 5, [pd_8192] ; m2=t9[w], m0=t13[w]
+
+ SCRATCH 0, 10, tmpq+ 0*%%str
+ SCRATCH 1, 11, tmpq+15*%%str
+ SCRATCH 2, 14, tmpq+ 3*%%str
+ SCRATCH 3, 15, tmpq+12*%%str
+
+ mova m2, [%1+ 6*32] ; in6
+ mova m3, [%1+ 9*32] ; in9
+ mova m0, [%1+ 1*32] ; in1
+ mova m1, [%1+14*32] ; in14
+
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d]
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 2404, 16207 ; m0/4=t15[d], m1/5=t14[d]
+ SCRATCH 4, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t6[w], m2=t14[w]
+ UNSCRATCH 4, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t7[w], m3=t15[w]
+
+ ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7
+ ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15
+
+ UNSCRATCH 4, 12, tmpq+ 2*%%str
+ UNSCRATCH 5, 13, tmpq+13*%%str
+ SCRATCH 0, 12, tmpq+ 1*%%str
+ SCRATCH 1, 13, tmpq+14*%%str
+
+ ; remainder of round 2 (rest of t8-15)
+ VP9_UNPACK_MULSUB_2D_4X 5, 4, 6, 7, 9102, 13623 ; m5/6=t11[d], m4/7=t10[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 1, 0, 13623, 9102 ; m3/1=t14[d], m2/0=t15[d]
+ SCRATCH 0, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 4, 1, 7, 0, [pd_8192] ; m3=t10[w], m4=t14[w]
+ UNSCRATCH 0, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 5, 0, 6, 1, [pd_8192] ; m2=t11[w], m5=t15[w]
+
+ ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15
+
+ UNSCRATCH 6, 14, tmpq+ 3*%%str
+ UNSCRATCH 7, 15, tmpq+12*%%str
+
+ SUMSUB_BA w, 3, 7, 1
+ PSIGNW m3, [pw_m1] ; m3=out1[w], m7=t10[w]
+ SUMSUB_BA w, 2, 6, 1 ; m2=out14[w], m6=t11[w]
+
+ ; unfortunately, the code below overflows in some cases, e.g.
+ ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm
+%if 0; cpuflag(ssse3)
+ SUMSUB_BA w, 7, 6, 1
+ pmulhrsw m7, [pw_11585x2] ; m7=out6[w]
+ pmulhrsw m6, [pw_11585x2] ; m6=out9[w]
+%else
+ VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+ mova [tmpq+ 3*%%str], m6
+ mova [tmpq+ 6*%%str], m7
+ UNSCRATCH 6, 10, tmpq+ 0*%%str
+ UNSCRATCH 7, 11, tmpq+15*%%str
+ mova [tmpq+13*%%str], m2
+ SCRATCH 3, 11, tmpq+ 9*%%str
+
+ VP9_UNPACK_MULSUB_2D_4X 7, 6, 2, 3, 15137, 6270 ; m6/3=t13[d], m7/2=t12[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 4, 1, 0, 6270, 15137 ; m5/1=t14[d], m4/0=t15[d]
+ SCRATCH 0, 9, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 5, 6, 1, 3, 0, [pd_8192] ; m5=out2[w], m6=t14[w]
+ UNSCRATCH 0, 9, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192]
+ PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w]
+
+ ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+ SUMSUB_BA w, 7, 6, 1
+ pmulhrsw m7, [pw_m11585x2] ; m7=out5[w]
+ pmulhrsw m6, [pw_11585x2] ; m6=out10[w]
+%else
+ PSIGNW m7, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+ ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14
+
+ mova m2, [tmpq+ 8*%%str]
+ mova m3, [tmpq+ 7*%%str]
+ mova m1, [tmpq+11*%%str]
+ mova [tmpq+ 7*%%str], m6
+ mova [tmpq+11*%%str], m4
+ mova m4, [tmpq+ 5*%%str]
+ SCRATCH 5, 14, tmpq+ 5*%%str
+ SCRATCH 7, 15, tmpq+ 8*%%str
+ UNSCRATCH 6, 8, tmpq+ 4*%%str
+ UNSCRATCH 5, 12, tmpq+ 1*%%str
+ UNSCRATCH 7, 13, tmpq+14*%%str
+
+ ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7
+ ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+ SUMSUB_BA w, 1, 2, 0 ; m1=t0[w], m2=t4[w]
+ mova m0, [tmpq+10*%%str]
+ SCRATCH 1, 12, tmpq+ 1*%%str
+ SUMSUB_BA w, 6, 3, 1 ; m8=t1[w], m3=t5[w]
+ SCRATCH 6, 13, tmpq+ 4*%%str
+ SUMSUB_BA w, 7, 4, 1 ; m13=t2[w], m9=t6[w]
+ SCRATCH 7, 8, tmpq+10*%%str
+ SUMSUB_BA w, 5, 0, 1 ; m12=t3[w], m0=t7[w]
+ SCRATCH 5, 9, tmpq+14*%%str
+
+ VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 5, 15137, 6270 ; m2/6=t5[d], m3/10=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 0, 4, 1, 6, 6270, 15137 ; m0/14=t6[d], m9/15=t7[d]
+ SCRATCH 6, 10, tmpq+ 0*%%str
+ VP9_RND_SH_SUMSUB_BA 0, 3, 1, 5, 6, [pd_8192]
+ UNSCRATCH 6, 10, tmpq+ 0*%%str
+ PSIGNW m0, [pw_m1] ; m0=out3[w], m3=t6[w]
+ VP9_RND_SH_SUMSUB_BA 4, 2, 6, 7, 5, [pd_8192] ; m9=out12[w], m2=t7[w]
+
+ UNSCRATCH 1, 8, tmpq+10*%%str
+ UNSCRATCH 5, 9, tmpq+14*%%str
+ UNSCRATCH 6, 12, tmpq+ 1*%%str
+ UNSCRATCH 7, 13, tmpq+ 4*%%str
+ SCRATCH 4, 9, tmpq+14*%%str
+
+ SUMSUB_BA w, 1, 6, 4 ; m13=out0[w], m1=t2[w]
+ SUMSUB_BA w, 5, 7, 4
+ PSIGNW m5, [pw_m1] ; m12=out15[w], m8=t3[w]
+
+ ; unfortunately, the code below overflows in some cases, e.g.
+ ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+%if 0 ; cpuflag(ssse3)
+ SUMSUB_BA w, 7, 6, 4
+ pmulhrsw m7, [pw_m11585x2] ; m8=out7[w]
+ pmulhrsw m6, [pw_11585x2] ; m1=out8[w]
+ SWAP 6, 7
+ SUMSUB_BA w, 3, 2, 4
+ pmulhrsw m3, [pw_11585x2] ; m3=out4[w]
+ pmulhrsw m2, [pw_11585x2] ; m2=out11[w]
+%else
+ SCRATCH 5, 8, tmpq+10*%%str
+ VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, m11585, [pd_8192], 5, 4
+ VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4
+ UNSCRATCH 5, 8, tmpq+10*%%str
+%endif
+
+ ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15
+ ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+%if %2 == 1
+%if ARCH_X86_64
+ mova m13, [tmpq+ 6*%%str]
+ TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 6, 10
+ mova [tmpq+ 0*16], m1
+ mova [tmpq+ 2*16], m11
+ mova [tmpq+ 4*16], m14
+ mova [tmpq+ 6*16], m0
+ mova m1, [tmpq+ 3*%%str]
+ mova m11, [tmpq+ 7*%%str]
+ mova m14, [tmpq+11*%%str]
+ mova m0, [tmpq+13*%%str]
+ mova [tmpq+ 8*16], m3
+ mova [tmpq+10*16], m15
+ mova [tmpq+12*16], m13
+ mova [tmpq+14*16], m6
+
+ TRANSPOSE8x8W 7, 1, 11, 2, 9, 14, 0, 5, 10
+ mova [tmpq+ 1*16], m7
+ mova [tmpq+ 3*16], m1
+ mova [tmpq+ 5*16], m11
+ mova [tmpq+ 7*16], m2
+ mova [tmpq+ 9*16], m9
+ mova [tmpq+11*16], m14
+ mova [tmpq+13*16], m0
+ mova [tmpq+15*16], m5
+%else
+ mova [tmpq+12*%%str], m2
+ mova [tmpq+ 1*%%str], m5
+ mova [tmpq+15*%%str], m7
+ mova m2, [tmpq+ 9*%%str]
+ mova m5, [tmpq+ 5*%%str]
+ mova m7, [tmpq+ 8*%%str]
+ TRANSPOSE8x8W 1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
+ mova [tmpq+ 0*16], m1
+ mova [tmpq+ 2*16], m2
+ mova [tmpq+ 4*16], m5
+ mova [tmpq+ 6*16], m0
+ mova [tmpq+10*16], m7
+ mova m3, [tmpq+12*%%str]
+ mova [tmpq+12*16], m4
+ mova m4, [tmpq+14*%%str]
+ mova [tmpq+14*16], m6
+
+ mova m0, [tmpq+15*%%str]
+ mova m1, [tmpq+ 3*%%str]
+ mova m2, [tmpq+ 7*%%str]
+ mova m5, [tmpq+11*%%str]
+ mova m7, [tmpq+ 1*%%str]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1
+ mova [tmpq+ 1*16], m0
+ mova [tmpq+ 3*16], m1
+ mova [tmpq+ 5*16], m2
+ mova [tmpq+ 7*16], m3
+ mova [tmpq+11*16], m5
+ mova [tmpq+13*16], m6
+ mova [tmpq+15*16], m7
+%endif
+%else
+ pxor m4, m4
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%if ARCH_X86_64
+ mova m12, [tmpq+ 6*%%str]
+ VP9_IDCT8_WRITEx2 1, 11, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 14, 0, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 12, 6, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m1, [tmpq+ 3*%%str]
+ mova m11, [tmpq+ 7*%%str]
+ mova m14, [tmpq+11*%%str]
+ mova m0, [tmpq+13*%%str]
+
+ VP9_IDCT8_WRITEx2 7, 1, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 9, 14, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 0, 5, 10, 8, 4, ROUND_REG, 6
+%else
+ mova [tmpq+ 0*%%str], m2
+ mova [tmpq+ 1*%%str], m5
+ mova [tmpq+ 2*%%str], m7
+ mova m2, [tmpq+ 9*%%str]
+ VP9_IDCT8_WRITEx2 1, 2, 5, 7, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m5, [tmpq+ 5*%%str]
+ VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m5, [tmpq+ 8*%%str]
+ VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m5, [tmpq+ 6*%%str]
+ VP9_IDCT8_WRITEx2 5, 6, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m0, [tmpq+ 2*%%str]
+ mova m3, [tmpq+ 3*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m0, [tmpq+ 7*%%str]
+ mova m3, [tmpq+ 0*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m0, [tmpq+14*%%str]
+ mova m3, [tmpq+11*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m0, [tmpq+13*%%str]
+ mova m3, [tmpq+ 1*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+%endif
+
+ SWAP 0, 4 ; zero
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro IADST16_FN 5
+INIT_XMM %5
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_%2_1D blockq, 1
+ add blockq, 16
+ add tmpq, 256
+ dec cntd
+ jg .loop1_full
+ sub blockq, 32
+
+ mov cntd, 2
+ mov tmpq, rsp
+ mov dst_bakq, dstq
+.loop2_full:
+ VP9_%4_1D tmpq, 2
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+IADST16_FN idct, IDCT16, iadst, IADST16, sse2
+IADST16_FN iadst, IADST16, idct, IDCT16, sse2
+IADST16_FN iadst, IADST16, iadst, IADST16, sse2
+IADST16_FN idct, IDCT16, iadst, IADST16, ssse3
+IADST16_FN iadst, IADST16, idct, IDCT16, ssse3
+IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
+IADST16_FN idct, IDCT16, iadst, IADST16, avx
+IADST16_FN iadst, IADST16, idct, IDCT16, avx
+IADST16_FN iadst, IADST16, iadst, IADST16, avx
+
+; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
+; out: m[0-15] except m6, which is in [blockq+192]
+; uses blockq as scratch space
+%macro VP9_IADST16_YMM_1D 0
+ mova [blockq+ 32], m3
+ mova [blockq+ 64], m7
+ mova [blockq+ 96], m8
+
+ ; first half of round 1
+ VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d]
+ VP9_UNPACK_MULSUB_2D_4X 1, 14, 4, 7, 2404, 16207 ; m1/x=t15[d], m14/x=t14[d]
+ VP9_RND_SH_SUMSUB_BA 14, 6, 7, 3, 8, [pd_8192] ; m14=t6[w], m6=t14[w]
+ VP9_RND_SH_SUMSUB_BA 1, 9, 4, 0, 8, [pd_8192] ; m1=t7[w], m9=t15[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 13, 2, 4, 7, 15893, 3981 ; m13/x=t3[d], m2/x=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 10, 0, 3, 8423, 14053 ; m5/x=t11[d], m10/x=t10[d]
+ VP9_RND_SH_SUMSUB_BA 10, 2, 3, 7, 8, [pd_8192] ; m10=t2[w], m2=t10[w]
+ VP9_RND_SH_SUMSUB_BA 5, 13, 0, 4, 8, [pd_8192] ; m5=t3[w], m13=t11[w]
+
+ ; half of round 2 t8-15
+ VP9_UNPACK_MULSUB_2D_4X 2, 13, 4, 7, 9102, 13623 ; m2/x=t11[d], m13/x=t10[d]
+ VP9_UNPACK_MULSUB_2D_4X 9, 6, 3, 0, 13623, 9102 ; m9/x=t14[d], m6/x=t15[d]
+ VP9_RND_SH_SUMSUB_BA 9, 13, 3, 7, 8, [pd_8192] ; m9=t10[w], m13=t14[w]
+ VP9_RND_SH_SUMSUB_BA 6, 2, 0, 4, 8, [pd_8192] ; m6=t11[w], m2=t15[w]
+
+ SUMSUB_BA w, 14, 10, 8 ; m14=t2, m10=t6
+ SUMSUB_BA w, 1, 5, 8 ; m1=t3, m5=t7
+
+ mova m0, [blockq+ 0]
+ mova m4, [blockq+128]
+ mova m3, [blockq+ 32]
+ mova m7, [blockq+ 64]
+ mova m8, [blockq+ 96]
+ mova [blockq+ 0], m1
+ mova [blockq+128], m14
+ mova [blockq+ 32], m6
+ mova [blockq+ 64], m9
+ mova [blockq+ 96], m10
+
+ ; second half of round 1
+ VP9_UNPACK_MULSUB_2D_4X 15, 0, 1, 9, 16364, 804 ; m15/x=t1[d], m0/x=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 7, 8, 10, 6, 11003, 12140 ; m7/x=t9[d], m8/x=t8[d]
+ VP9_RND_SH_SUMSUB_BA 8, 0, 6, 9, 14, [pd_8192] ; m8=t0[w], m0=t8[w]
+ VP9_RND_SH_SUMSUB_BA 7, 15, 10, 1, 14, [pd_8192] ; m7=t1[w], m15=t9[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 11, 4, 10, 6, 14811, 7005 ; m11/x=t5[d], m4/x=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 12, 1, 9, 5520, 15426 ; m3/x=t13[d], m12/x=t12[d]
+ VP9_RND_SH_SUMSUB_BA 12, 4, 9, 6, 14, [pd_8192] ; m12=t4[w], m4=t12[w]
+ VP9_RND_SH_SUMSUB_BA 3, 11, 1, 10, 14, [pd_8192] ; m3=t5[w], m11=t13[w]
+
+ ; second half of round 2 t8-15
+ VP9_UNPACK_MULSUB_2D_4X 0, 15, 6, 10, 16069, 3196 ; m15/x=t8[d], m0/x=t9[d]
+ VP9_UNPACK_MULSUB_2D_4X 11, 4, 9, 1, 3196, 16069 ; m11/x=t12[d], m4/x=t13[d]
+ VP9_RND_SH_SUMSUB_BA 11, 15, 9, 10, 14, [pd_8192] ; m11=t8[w], m15=t12[w]
+ VP9_RND_SH_SUMSUB_BA 4, 0, 1, 6, 14, [pd_8192] ; m4=t9[w], m0=t13[w]
+
+ SUMSUB_BA w, 12, 8, 14 ; m12=t0, m8=t4
+ SUMSUB_BA w, 3, 7, 14 ; m3=t1, m7=t5
+
+ mova m10, [blockq+ 96]
+ mova [blockq+ 96], m12
+
+ ; round 3
+ VP9_UNPACK_MULSUB_2D_4X 15, 0, 9, 12, 15137, 6270 ; m15/x=t13[d], m0/x=t12[d]
+ VP9_UNPACK_MULSUB_2D_4X 2, 13, 1, 6, 6270, 15137 ; m2/x=t14[d], m13/x=t15[d]
+ VP9_RND_SH_SUMSUB_BA 2, 0, 1, 12, 14, [pd_8192] ; m2=out2[w], m0=t14a[w]
+ VP9_RND_SH_SUMSUB_BA 13, 15, 6, 9, 14, [pd_8192]
+ PSIGNW m13, [pw_m1] ; m13=out13[w], m15=t15a[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 8, 7, 12, 9, 15137, 6270 ; m8/x=t5[d], m7/x=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 10, 1, 6, 6270, 15137 ; m5/x=t6[d], m10/x=t7[d]
+ VP9_RND_SH_SUMSUB_BA 5, 7, 1, 9, 14, [pd_8192]
+ PSIGNW m5, [pw_m1] ; m5=out3[w], m7=t6[w]
+ VP9_RND_SH_SUMSUB_BA 10, 8, 6, 12, 14, [pd_8192] ; m10=out12[w], m8=t7[w]
+
+ mova m1, [blockq+ 0]
+ mova m14, [blockq+128]
+ mova m6, [blockq+ 32]
+ mova m9, [blockq+ 64]
+ mova m12, [blockq+ 96]
+ mova [blockq+ 0], m10
+ mova [blockq+128], m5
+
+ SUMSUB_BA w, 14, 12, 5 ; m14=out0, m12=t2a
+ SUMSUB_BA w, 1, 3, 5
+ PSIGNW m1, [pw_m1] ; m1=out15, m3=t3a
+
+ SUMSUB_BA w, 9, 11, 5
+ PSIGNW m9, [pw_m1] ; m9=out1, m11=t10
+ SUMSUB_BA w, 6, 4, 5 ; m6=out14, m4=t11
+
+ VP9_UNPACK_MULSUB_2W_4X 4, 11, 11585, 11585, [pd_8192], 5, 10 ; m4=out9, m11=out6
+ mova m5, [blockq+128]
+ mova [blockq+192], m11
+ PSIGNW m15, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 15, 0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10
+
+ PSIGNW m3, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8
+ VP9_UNPACK_MULSUB_2W_4X 8, 7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4
+
+ mova m10, [blockq+ 0]
+
+ SWAP 0, 14, 6, 11, 8, 12, 10
+ SWAP 1, 9, 15, 4, 7, 3, 5
+ SWAP 5, 9, 15
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+%macro IADST16_YMM_FN 4
+INIT_YMM avx2
+cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob
+ mova m1, [blockq+ 32]
+ mova m2, [blockq+ 64]
+ mova m3, [blockq+ 96]
+ mova m5, [blockq+160]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ mova m8, [blockq+256]
+ mova m9, [blockq+288]
+ mova m10, [blockq+320]
+ mova m11, [blockq+352]
+ mova m12, [blockq+384]
+ mova m13, [blockq+416]
+ mova m14, [blockq+448]
+ mova m15, [blockq+480]
+
+ VP9_%2_YMM_1D
+ TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ [blockq+192], [blockq+128], 1
+ mova [blockq+ 0], m0
+ VP9_%4_YMM_1D
+
+ mova [blockq+224], m7
+
+ ; store
+ VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ pxor m0, m0
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+IADST16_YMM_FN idct, IDCT16, iadst, IADST16
+IADST16_YMM_FN iadst, IADST16, idct, IDCT16
+IADST16_YMM_FN iadst, IADST16, iadst, IADST16
+%endif
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
+%if %2 == 1
+%assign %%str mmsize
+%else
+%assign %%str 64
+%endif
+
+ ; first do t0-15; this can be done identically to the idct16x16
+ VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
+
+ ; store everything on stack to make space available for t16-31
+ ; we store interleaved with the output of the second half (t16-31)
+ ; so we don't need to allocate extra stack space
+ mova [tmpq+ 0*%%str], m0 ; t0
+ mova [tmpq+ 4*%%str], m1 ; t1
+ mova [tmpq+ 8*%%str], m2 ; t2
+ mova [tmpq+12*%%str], m3 ; t3
+ mova [tmpq+16*%%str], m4 ; t4
+ mova [tmpq+20*%%str], m5 ; t5
+%if ARCH_X86_64
+ mova [tmpq+22*%%str], m10 ; t10
+ mova [tmpq+18*%%str], m11 ; t11
+ mova [tmpq+14*%%str], m12 ; t12
+ mova [tmpq+10*%%str], m13 ; t13
+ mova [tmpq+ 6*%%str], m14 ; t14
+ mova [tmpq+ 2*%%str], m15 ; t15
+%endif
+
+ mova m0, [tmpq+ 30*%%str]
+ UNSCRATCH 1, 6, tmpq+26*%%str
+ UNSCRATCH 2, 8, tmpq+24*%%str
+ UNSCRATCH 3, 9, tmpq+28*%%str
+ SUMSUB_BA w, 1, 3, 4 ; t6, t9
+ SUMSUB_BA w, 0, 2, 4 ; t7, t8
+
+ mova [tmpq+24*%%str], m1 ; t6
+ mova [tmpq+28*%%str], m0 ; t7
+ mova [tmpq+30*%%str], m2 ; t8
+ mova [tmpq+26*%%str], m3 ; t9
+
+ ; then, secondly, do t16-31
+%if %3 <= 8
+ mova m4, [%1+ 1*64]
+ mova m7, [%1+ 7*64]
+
+ pmulhrsw m1, m4, [pw_16364x2] ;t31
+ pmulhrsw m4, [pw_804x2] ;t16
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 0, 1, 4, 16069, 3196, [pd_8192], 6, 2 ; t17, t30
+
+ pmulhrsw m3, m7, [pw_m5520x2] ;t19
+ pmulhrsw m7, [pw_15426x2] ;t28
+
+ SCRATCH 4, 13, tmpq+ 1*%%str
+ SCRATCH 5, 12, tmpq+15*%%str
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 6, 7, 3, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
+%else
+ mova m0, [%1+ 1*64]
+ mova m1, [%1+15*64]
+%if %3 <= 16
+ pmulhrsw m5, m0, [pw_16364x2]
+ pmulhrsw m0, [pw_804x2]
+ pmulhrsw m4, m1, [pw_m11003x2]
+ pmulhrsw m1, [pw_12140x2]
+%else
+ mova m4, [%1+17*64]
+ mova m5, [%1+31*64]
+
+ VP9_UNPACK_MULSUB_2W_4X 0, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31
+ VP9_UNPACK_MULSUB_2W_4X 4, 1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
+%endif
+ SUMSUB_BA w, 4, 0, 2
+ SUMSUB_BA w, 1, 5, 2
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 0, 16069, 3196, [pd_8192], 2, 3 ; t17, t30
+
+ SCRATCH 4, 13, tmpq+ 1*%%str
+ SCRATCH 5, 12, tmpq+15*%%str
+
+ mova m2, [%1+ 7*64]
+ mova m3, [%1+ 9*64]
+%if %3 <= 16
+ pmulhrsw m7, m3, [pw_14811x2]
+ pmulhrsw m3, [pw_7005x2]
+ pmulhrsw m6, m2, [pw_m5520x2]
+ pmulhrsw m2, [pw_15426x2]
+%else
+ mova m7, [%1+23*64]
+ mova m6, [%1+25*64]
+
+ VP9_UNPACK_MULSUB_2W_4X 3, 7, 14811, 7005, [pd_8192], 4, 5 ; t18, t29
+ VP9_UNPACK_MULSUB_2W_4X 6, 2, 5520, 15426, [pd_8192], 4, 5 ; t19, t28
+%endif
+ SUMSUB_BA w, 3, 6, 4
+ SUMSUB_BA w, 7, 2, 4
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
+%endif
+
+ UNSCRATCH 5, 12, tmpq+15*%%str
+ SUMSUB_BA w, 6, 0, 4
+ mova [tmpq+25*%%str], m6 ; t19
+ UNSCRATCH 4, 13, tmpq+ 1*%%str
+ SUMSUB_BA w, 7, 1, 6
+ SUMSUB_BA w, 3, 4, 6
+ mova [tmpq+23*%%str], m3 ; t16
+ SUMSUB_BA w, 2, 5, 6
+
+ VP9_UNPACK_MULSUB_2W_4X 0, 5, 15137, 6270, [pd_8192], 6, 3 ; t18, t29
+ VP9_UNPACK_MULSUB_2W_4X 1, 4, 15137, 6270, [pd_8192], 6, 3 ; t19, t28
+
+ SCRATCH 0, 10, tmpq+ 1*%%str
+ SCRATCH 1, 11, tmpq+ 7*%%str
+ SCRATCH 2, 9, tmpq+ 9*%%str
+ SCRATCH 4, 14, tmpq+15*%%str
+ SCRATCH 5, 15, tmpq+17*%%str
+ SCRATCH 7, 13, tmpq+31*%%str
+
+%if %3 <= 8
+ mova m0, [%1+ 5*64]
+ mova m3, [%1+ 3*64]
+
+ pmulhrsw m5, m0, [pw_15893x2] ;t27
+ pmulhrsw m0, [pw_3981x2] ;t20
+
+ VP9_UNPACK_MULSUB_2W_4X 1, 4, 5, 0, 9102, 13623, [pd_8192], 7, 2 ; t21, t26
+
+ pmulhrsw m6, m3, [pw_m2404x2] ;t23
+ pmulhrsw m3, [pw_16207x2] ;t24
+
+ SCRATCH 5, 8, tmpq+ 5*%%str
+ SCRATCH 4, 12, tmpq+11*%%str
+
+ VP9_UNPACK_MULSUB_2W_4X 7, 2, 3, 6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%else
+ mova m4, [%1+ 5*64]
+ mova m5, [%1+11*64]
+%if %3 <= 16
+ pmulhrsw m1, m4, [pw_15893x2]
+ pmulhrsw m4, [pw_3981x2]
+ pmulhrsw m0, m5, [pw_m8423x2]
+ pmulhrsw m5, [pw_14053x2]
+%else
+ mova m0, [%1+21*64]
+ mova m1, [%1+27*64]
+
+ VP9_UNPACK_MULSUB_2W_4X 4, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27
+ VP9_UNPACK_MULSUB_2W_4X 0, 5, 8423, 14053, [pd_8192], 2, 3 ; t21, t26
+%endif
+ SUMSUB_BA w, 0, 4, 2
+ SUMSUB_BA w, 5, 1, 2
+
+ VP9_UNPACK_MULSUB_2W_4X 1, 4, 9102, 13623, [pd_8192], 2, 3 ; t21, t26
+
+ SCRATCH 5, 8, tmpq+ 5*%%str
+ SCRATCH 4, 12, tmpq+11*%%str
+
+ mova m7, [%1+ 3*64]
+ mova m6, [%1+13*64]
+%if %3 <= 16
+ pmulhrsw m3, m6, [pw_13160x2]
+ pmulhrsw m6, [pw_9760x2]
+ pmulhrsw m2, m7, [pw_m2404x2]
+ pmulhrsw m7, [pw_16207x2]
+%else
+ mova m2, [%1+29*64]
+ mova m3, [%1+19*64]
+ VP9_UNPACK_MULSUB_2W_4X 6, 3, 13160, 9760, [pd_8192], 4, 5 ; t22, t25
+ VP9_UNPACK_MULSUB_2W_4X 2, 7, 2404, 16207, [pd_8192], 4, 5 ; t23, t24
+%endif
+ SUMSUB_BA w, 6, 2, 4
+ SUMSUB_BA w, 3, 7, 4
+
+ VP9_UNPACK_MULSUB_2W_4X 7, 2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%endif
+
+ ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+ ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+
+ UNSCRATCH 4, 12, tmpq+11*%%str
+ SUMSUB_BA w, 0, 6, 5
+ SUMSUB_BA w, 4, 2, 5
+ UNSCRATCH 5, 8, tmpq+ 5*%%str
+ SCRATCH 4, 8, tmpq+11*%%str
+ SUMSUB_BA w, 1, 7, 4
+ SUMSUB_BA w, 5, 3, 4
+ SCRATCH 5, 12, tmpq+ 5*%%str
+
+ VP9_UNPACK_MULSUB_2W_4X 3, 6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27
+ VP9_UNPACK_MULSUB_2W_4X 2, 7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26
+
+ ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
+ ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
+
+ UNSCRATCH 5, 9, tmpq+ 9*%%str
+ mova m4, [tmpq+23*%%str] ; t16
+%if ARCH_X86_64
+ SUMSUB_BA w, 1, 5, 9
+ SUMSUB_BA w, 0, 4, 9
+%else
+ SUMSUB_BADC w, 1, 5, 0, 4
+%endif
+ mova [tmpq+29*%%str], m1 ; t17
+ mova [tmpq+21*%%str], m0 ; t16
+ UNSCRATCH 0, 10, tmpq+ 1*%%str
+ UNSCRATCH 1, 11, tmpq+ 7*%%str
+%if ARCH_X86_64
+ SUMSUB_BA w, 2, 0, 9
+ SUMSUB_BA w, 3, 1, 9
+%else
+ SUMSUB_BADC w, 2, 0, 3, 1
+%endif
+ mova [tmpq+ 9*%%str], m2 ; t18
+ mova [tmpq+13*%%str], m3 ; t19
+ SCRATCH 0, 10, tmpq+23*%%str
+ SCRATCH 1, 11, tmpq+27*%%str
+
+ UNSCRATCH 2, 14, tmpq+15*%%str
+ UNSCRATCH 3, 15, tmpq+17*%%str
+ SUMSUB_BA w, 6, 2, 0
+ SUMSUB_BA w, 7, 3, 0
+ SCRATCH 6, 14, tmpq+ 3*%%str
+ SCRATCH 7, 15, tmpq+ 7*%%str
+
+ UNSCRATCH 0, 8, tmpq+11*%%str
+ mova m1, [tmpq+25*%%str] ; t19
+ UNSCRATCH 6, 12, tmpq+ 5*%%str
+ UNSCRATCH 7, 13, tmpq+31*%%str
+%if ARCH_X86_64
+ SUMSUB_BA w, 0, 1, 9
+ SUMSUB_BA w, 6, 7, 9
+%else
+ SUMSUB_BADC w, 0, 1, 6, 7
+%endif
+
+ ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
+ ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+%if 0; cpuflag(ssse3)
+%if ARCH_X86_64
+ SUMSUB_BA w, 4, 7, 8
+ SUMSUB_BA w, 5, 1, 8
+%else
+ SUMSUB_BADC w, 4, 7, 5, 1
+%endif
+
+ pmulhrsw m7, [pw_11585x2]
+ pmulhrsw m4, [pw_11585x2]
+ pmulhrsw m1, [pw_11585x2]
+ pmulhrsw m5, [pw_11585x2]
+
+ mova [tmpq+ 5*%%str], m7 ; t23
+ SCRATCH 1, 13, tmpq+25*%%str
+ UNSCRATCH 7, 10, tmpq+23*%%str
+ UNSCRATCH 1, 11, tmpq+27*%%str
+
+%if ARCH_X86_64
+ SUMSUB_BA w, 7, 3, 10
+ SUMSUB_BA w, 1, 2, 10
+%else
+ SUMSUB_BADC w, 7, 3, 1, 2
+%endif
+
+ pmulhrsw m3, [pw_11585x2]
+ pmulhrsw m7, [pw_11585x2]
+ pmulhrsw m2, [pw_11585x2]
+ pmulhrsw m1, [pw_11585x2]
+%else
+ SCRATCH 0, 8, tmpq+15*%%str
+ SCRATCH 6, 9, tmpq+17*%%str
+ VP9_UNPACK_MULSUB_2W_4X 7, 4, 11585, 11585, [pd_8192], 0, 6
+ mova [tmpq+ 5*%%str], m7 ; t23
+ UNSCRATCH 7, 10, tmpq+23*%%str
+ VP9_UNPACK_MULSUB_2W_4X 1, 5, 11585, 11585, [pd_8192], 0, 6
+ SCRATCH 1, 13, tmpq+25*%%str
+ UNSCRATCH 1, 11, tmpq+27*%%str
+ VP9_UNPACK_MULSUB_2W_4X 3, 7, 11585, 11585, [pd_8192], 0, 6
+ VP9_UNPACK_MULSUB_2W_4X 2, 1, 11585, 11585, [pd_8192], 0, 6
+ UNSCRATCH 0, 8, tmpq+15*%%str
+ UNSCRATCH 6, 9, tmpq+17*%%str
+%endif
+
+ ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
+ ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+ ; then do final pass to sumsub+store the two halves
+%if %2 == 1
+ mova [tmpq+17*%%str], m2 ; t20
+ mova [tmpq+ 1*%%str], m3 ; t21
+%if ARCH_X86_64
+ mova [tmpq+25*%%str], m13 ; t22
+
+ mova m8, [tmpq+ 0*%%str] ; t0
+ mova m9, [tmpq+ 4*%%str] ; t1
+ mova m12, [tmpq+ 8*%%str] ; t2
+ mova m11, [tmpq+12*%%str] ; t3
+ mova m2, [tmpq+16*%%str] ; t4
+ mova m3, [tmpq+20*%%str] ; t5
+ mova m13, [tmpq+24*%%str] ; t6
+
+ SUMSUB_BA w, 6, 8, 10
+ mova [tmpq+ 3*%%str], m8 ; t15
+ SUMSUB_BA w, 0, 9, 8
+ SUMSUB_BA w, 15, 12, 8
+ SUMSUB_BA w, 14, 11, 8
+ SUMSUB_BA w, 1, 2, 8
+ SUMSUB_BA w, 7, 3, 8
+ SUMSUB_BA w, 5, 13, 8
+ mova m10, [tmpq+28*%%str] ; t7
+ SUMSUB_BA w, 4, 10, 8
+%if cpuflag(avx2)
+ ; the awkward part of this idct is that the final pass does the outermost
+ ; interleave sumsubs (t0/31, t1/30, etc), but the tN for the 16x16 transpose
+ ; need to be sequential, which means we have to load/store half of the sumsub
+ ; intermediates back to/from memory to get a 16x16 transpose going...
+ ; This would be easier if we had more (e.g. 32) YMM regs here.
+ mova [tmpq+ 7*%%str], m9
+ mova [tmpq+11*%%str], m12
+ mova [tmpq+15*%%str], m11
+ mova [tmpq+19*%%str], m2
+ mova [tmpq+23*%%str], m3
+ mova [tmpq+27*%%str], m13
+ mova [tmpq+31*%%str], m10
+ mova [tmpq+12*%%str], m5
+
+ mova m13, [tmpq+30*%%str] ; t8
+ mova m12, [tmpq+26*%%str] ; t9
+ mova m11, [tmpq+22*%%str] ; t10
+ mova m10, [tmpq+18*%%str] ; t11
+ mova m9, [tmpq+17*%%str] ; t20
+ mova m8, [tmpq+ 1*%%str] ; t21
+ mova m3, [tmpq+25*%%str] ; t22
+ mova m2, [tmpq+ 5*%%str] ; t23
+
+ SUMSUB_BA w, 9, 10, 5
+ SUMSUB_BA w, 8, 11, 5
+ SUMSUB_BA w, 3, 12, 5
+ SUMSUB_BA w, 2, 13, 5
+ mova [tmpq+ 1*%%str], m10
+ mova [tmpq+ 5*%%str], m11
+ mova [tmpq+17*%%str], m12
+ mova [tmpq+25*%%str], m13
+
+ mova m13, [tmpq+14*%%str] ; t12
+ mova m12, [tmpq+10*%%str] ; t13
+ mova m11, [tmpq+ 9*%%str] ; t18
+ mova m10, [tmpq+13*%%str] ; t19
+
+ SUMSUB_BA w, 11, 12, 5
+ SUMSUB_BA w, 10, 13, 5
+ mova [tmpq+ 9*%%str], m13
+ mova [tmpq+13*%%str], m12
+ mova [tmpq+10*%%str], m10
+ mova [tmpq+14*%%str], m11
+
+ mova m13, [tmpq+ 6*%%str] ; t14
+ mova m12, [tmpq+ 2*%%str] ; t15
+ mova m11, [tmpq+21*%%str] ; t16
+ mova m10, [tmpq+29*%%str] ; t17
+ SUMSUB_BA w, 11, 12, 5
+ SUMSUB_BA w, 10, 13, 5
+ mova [tmpq+21*%%str], m12
+ mova [tmpq+29*%%str], m13
+ mova m12, [tmpq+10*%%str]
+ mova m13, [tmpq+14*%%str]
+
+ TRANSPOSE16x16W 6, 0, 15, 14, 1, 7, 5, 4, \
+ 2, 3, 8, 9, 12, 13, 10, 11, \
+ [tmpq+12*%%str], [tmpq+ 8*%%str], 1
+ mova [tmpq+ 0*%%str], m6
+ mova [tmpq+ 2*%%str], m0
+ mova [tmpq+ 4*%%str], m15
+ mova [tmpq+ 6*%%str], m14
+ mova [tmpq+10*%%str], m7
+ mova [tmpq+12*%%str], m5
+ mova [tmpq+14*%%str], m4
+ mova [tmpq+16*%%str], m2
+ mova [tmpq+18*%%str], m3
+ mova [tmpq+20*%%str], m8
+ mova [tmpq+22*%%str], m9
+ mova [tmpq+24*%%str], m12
+ mova [tmpq+26*%%str], m13
+ mova [tmpq+28*%%str], m10
+ mova [tmpq+30*%%str], m11
+
+ mova m0, [tmpq+21*%%str]
+ mova m1, [tmpq+29*%%str]
+ mova m2, [tmpq+13*%%str]
+ mova m3, [tmpq+ 9*%%str]
+ mova m4, [tmpq+ 1*%%str]
+ mova m5, [tmpq+ 5*%%str]
+ mova m7, [tmpq+25*%%str]
+ mova m8, [tmpq+31*%%str]
+ mova m9, [tmpq+27*%%str]
+ mova m10, [tmpq+23*%%str]
+ mova m11, [tmpq+19*%%str]
+ mova m12, [tmpq+15*%%str]
+ mova m13, [tmpq+11*%%str]
+ mova m14, [tmpq+ 7*%%str]
+ mova m15, [tmpq+ 3*%%str]
+ TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ [tmpq+17*%%str], [tmpq+ 9*%%str], 1
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 3*%%str], m1
+ mova [tmpq+ 5*%%str], m2
+ mova [tmpq+ 7*%%str], m3
+ mova [tmpq+11*%%str], m5
+ mova [tmpq+13*%%str], m6
+ mova [tmpq+15*%%str], m7
+ mova [tmpq+17*%%str], m8
+ mova [tmpq+19*%%str], m9
+ mova [tmpq+21*%%str], m10
+ mova [tmpq+23*%%str], m11
+ mova [tmpq+25*%%str], m12
+ mova [tmpq+27*%%str], m13
+ mova [tmpq+29*%%str], m14
+ mova [tmpq+31*%%str], m15
+%else ; !avx2
+ TRANSPOSE8x8W 6, 0, 15, 14, 1, 7, 5, 4, 8
+ mova [tmpq+ 0*%%str], m6
+ mova [tmpq+ 4*%%str], m0
+ mova [tmpq+ 8*%%str], m15
+ mova [tmpq+12*%%str], m14
+ mova [tmpq+16*%%str], m1
+ mova [tmpq+20*%%str], m7
+ mova [tmpq+24*%%str], m5
+ mova [tmpq+28*%%str], m4
+
+ mova m8, [tmpq+ 3*%%str] ; t15
+ TRANSPOSE8x8W 10, 13, 3, 2, 11, 12, 9, 8, 0
+ mova [tmpq+ 3*%%str], m10
+ mova [tmpq+ 7*%%str], m13
+ mova [tmpq+11*%%str], m3
+ mova [tmpq+15*%%str], m2
+ mova [tmpq+19*%%str], m11
+ mova [tmpq+23*%%str], m12
+ mova [tmpq+27*%%str], m9
+ mova [tmpq+31*%%str], m8
+
+ mova m15, [tmpq+30*%%str] ; t8
+ mova m14, [tmpq+26*%%str] ; t9
+ mova m13, [tmpq+22*%%str] ; t10
+ mova m12, [tmpq+18*%%str] ; t11
+ mova m11, [tmpq+14*%%str] ; t12
+ mova m10, [tmpq+10*%%str] ; t13
+ mova m9, [tmpq+ 6*%%str] ; t14
+ mova m8, [tmpq+ 2*%%str] ; t15
+ mova m7, [tmpq+21*%%str] ; t16
+ mova m6, [tmpq+29*%%str] ; t17
+ mova m5, [tmpq+ 9*%%str] ; t18
+ mova m4, [tmpq+13*%%str] ; t19
+ mova m3, [tmpq+17*%%str] ; t20
+ mova m2, [tmpq+ 1*%%str] ; t21
+ mova m1, [tmpq+25*%%str] ; t22
+
+ SUMSUB_BA w, 7, 8, 0
+ mova [tmpq+ 2*%%str], m8
+ mova m0, [tmpq+ 5*%%str] ; t23
+ SUMSUB_BA w, 6, 9, 8
+ SUMSUB_BA w, 5, 10, 8
+ SUMSUB_BA w, 4, 11, 8
+ SUMSUB_BA w, 3, 12, 8
+ SUMSUB_BA w, 2, 13, 8
+ SUMSUB_BA w, 1, 14, 8
+ SUMSUB_BA w, 0, 15, 8
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 5*%%str], m1
+ mova [tmpq+ 9*%%str], m2
+ mova [tmpq+13*%%str], m3
+ mova [tmpq+17*%%str], m4
+ mova [tmpq+21*%%str], m5
+ mova [tmpq+25*%%str], m6
+ mova [tmpq+29*%%str], m7
+
+ mova m8, [tmpq+ 2*%%str]
+ TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
+ mova [tmpq+ 2*%%str], m8
+ mova [tmpq+ 6*%%str], m9
+ mova [tmpq+10*%%str], m10
+ mova [tmpq+14*%%str], m11
+ mova [tmpq+18*%%str], m12
+ mova [tmpq+22*%%str], m13
+ mova [tmpq+26*%%str], m14
+ mova [tmpq+30*%%str], m15
+%endif ; avx2
+%else
+ mova m2, [tmpq+24*%%str] ; t6
+ mova m3, [tmpq+28*%%str] ; t7
+ SUMSUB_BADC w, 5, 2, 4, 3
+ mova [tmpq+24*%%str], m5
+ mova [tmpq+23*%%str], m2
+ mova [tmpq+28*%%str], m4
+ mova [tmpq+19*%%str], m3
+
+ mova m2, [tmpq+16*%%str] ; t4
+ mova m3, [tmpq+20*%%str] ; t5
+ SUMSUB_BA w, 1, 2, 5
+ SUMSUB_BA w, 7, 3, 5
+ mova [tmpq+15*%%str], m2
+ mova [tmpq+11*%%str], m3
+
+ mova m2, [tmpq+ 0*%%str] ; t0
+ mova m3, [tmpq+ 4*%%str] ; t1
+ SUMSUB_BA w, 6, 2, 5
+ SUMSUB_BA w, 0, 3, 5
+ mova [tmpq+31*%%str], m2
+ mova [tmpq+27*%%str], m3
+
+ mova m2, [tmpq+ 8*%%str] ; t2
+ mova m3, [tmpq+12*%%str] ; t3
+ mova m5, [tmpq+ 7*%%str]
+ mova m4, [tmpq+ 3*%%str]
+ SUMSUB_BADC w, 5, 2, 4, 3
+ mova [tmpq+ 7*%%str], m2
+ mova [tmpq+ 3*%%str], m3
+
+ mova m3, [tmpq+28*%%str]
+ TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1
+ mova [tmpq+ 0*%%str], m6
+ mova [tmpq+ 4*%%str], m0
+ mova [tmpq+ 8*%%str], m5
+ mova [tmpq+12*%%str], m4
+ mova [tmpq+20*%%str], m7
+ mova [tmpq+24*%%str], m2
+ mova [tmpq+28*%%str], m3
+
+ mova m6, [tmpq+19*%%str]
+ mova m0, [tmpq+23*%%str]
+ mova m5, [tmpq+11*%%str]
+ mova m4, [tmpq+15*%%str]
+ mova m1, [tmpq+ 3*%%str]
+ mova m7, [tmpq+ 7*%%str]
+ mova m3, [tmpq+31*%%str]
+ TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1
+ mova [tmpq+ 3*%%str], m6
+ mova [tmpq+ 7*%%str], m0
+ mova [tmpq+11*%%str], m5
+ mova [tmpq+15*%%str], m4
+ mova [tmpq+23*%%str], m7
+ mova [tmpq+27*%%str], m2
+ mova [tmpq+31*%%str], m3
+
+ mova m1, [tmpq+ 6*%%str] ; t14
+ mova m0, [tmpq+ 2*%%str] ; t15
+ mova m7, [tmpq+21*%%str] ; t16
+ mova m6, [tmpq+29*%%str] ; t17
+ SUMSUB_BA w, 7, 0, 2
+ SUMSUB_BA w, 6, 1, 2
+ mova [tmpq+29*%%str], m7
+ mova [tmpq+ 2*%%str], m0
+ mova [tmpq+21*%%str], m6
+ mova [tmpq+ 6*%%str], m1
+
+ mova m1, [tmpq+14*%%str] ; t12
+ mova m0, [tmpq+10*%%str] ; t13
+ mova m5, [tmpq+ 9*%%str] ; t18
+ mova m4, [tmpq+13*%%str] ; t19
+ SUMSUB_BA w, 5, 0, 2
+ SUMSUB_BA w, 4, 1, 2
+ mova [tmpq+10*%%str], m0
+ mova [tmpq+14*%%str], m1
+
+ mova m1, [tmpq+22*%%str] ; t10
+ mova m0, [tmpq+18*%%str] ; t11
+ mova m3, [tmpq+17*%%str] ; t20
+ mova m2, [tmpq+ 1*%%str] ; t21
+ SUMSUB_BA w, 3, 0, 6
+ SUMSUB_BA w, 2, 1, 6
+ mova [tmpq+18*%%str], m0
+ mova [tmpq+22*%%str], m1
+
+ mova m7, [tmpq+30*%%str] ; t8
+ mova m6, [tmpq+26*%%str] ; t9
+ mova m1, [tmpq+25*%%str] ; t22
+ mova m0, [tmpq+ 5*%%str] ; t23
+ SUMSUB_BADC w, 1, 6, 0, 7
+ mova [tmpq+26*%%str], m6
+ mova [tmpq+30*%%str], m7
+
+ mova m7, [tmpq+29*%%str]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 5*%%str], m1
+ mova [tmpq+ 9*%%str], m2
+ mova [tmpq+13*%%str], m3
+ mova [tmpq+21*%%str], m5
+ mova [tmpq+25*%%str], m6
+ mova [tmpq+29*%%str], m7
+
+ mova m0, [tmpq+ 2*%%str]
+ mova m1, [tmpq+ 6*%%str]
+ mova m2, [tmpq+10*%%str]
+ mova m3, [tmpq+14*%%str]
+ mova m4, [tmpq+18*%%str]
+ mova m5, [tmpq+22*%%str]
+ mova m7, [tmpq+30*%%str]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1
+ mova [tmpq+ 2*%%str], m0
+ mova [tmpq+ 6*%%str], m1
+ mova [tmpq+10*%%str], m2
+ mova [tmpq+14*%%str], m3
+ mova [tmpq+22*%%str], m5
+ mova [tmpq+26*%%str], m6
+ mova [tmpq+30*%%str], m7
+%endif
+%else
+ ; t0-7 are in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+ ; t8-15 are in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+ ; t16-19 and t23 are in [tmpq+{1,5,9,13,29}*%%str]
+ ; t20-22 are in m4-6
+ ; t24-31 are in m8-15
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
+ SUMSUB_BA w, %4, %1, %5
+ SUMSUB_BA w, %3, %2, %5
+ VP9_IDCT8_WRITEx2 %4, %3, %5, %6, %7, ROUND_REG, 6
+%if %8 == 1
+ add dstq, stride2q
+%endif
+ VP9_IDCT8_WRITEx2 %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq
+%if %8 == 1
+ sub dst_endq, stride2q
+%endif
+%endmacro
+
+%if ARCH_X86_64
+ pxor m10, m10
+
+ ; store t0-1 and t30-31
+ mova m8, [tmpq+ 0*%%str]
+ mova m9, [tmpq+ 4*%%str]
+ %%STORE_2X2 8, 9, 0, 6, 12, 11, 10
+
+ ; store t2-3 and t28-29
+ mova m8, [tmpq+ 8*%%str]
+ mova m9, [tmpq+12*%%str]
+ %%STORE_2X2 8, 9, 14, 15, 12, 11, 10
+
+ ; store t4-5 and t26-27
+ mova m8, [tmpq+16*%%str]
+ mova m9, [tmpq+20*%%str]
+ %%STORE_2X2 8, 9, 7, 1, 12, 11, 10
+
+ ; store t6-7 and t24-25
+ mova m8, [tmpq+24*%%str]
+ mova m9, [tmpq+28*%%str]
+ %%STORE_2X2 8, 9, 4, 5, 12, 11, 10
+
+ ; store t8-9 and t22-23
+ mova m8, [tmpq+30*%%str]
+ mova m9, [tmpq+26*%%str]
+ mova m0, [tmpq+ 5*%%str]
+ %%STORE_2X2 8, 9, 13, 0, 12, 11, 10
+
+ ; store t10-11 and t20-21
+ mova m8, [tmpq+22*%%str]
+ mova m9, [tmpq+18*%%str]
+ %%STORE_2X2 8, 9, 2, 3, 12, 11, 10
+
+ ; store t12-13 and t18-19
+ mova m8, [tmpq+14*%%str]
+ mova m9, [tmpq+10*%%str]
+ mova m5, [tmpq+13*%%str]
+ mova m4, [tmpq+ 9*%%str]
+ %%STORE_2X2 8, 9, 4, 5, 12, 11, 10
+
+ ; store t14-17
+ mova m8, [tmpq+ 6*%%str]
+ mova m9, [tmpq+ 2*%%str]
+ mova m5, [tmpq+29*%%str]
+ mova m4, [tmpq+21*%%str]
+ %%STORE_2X2 8, 9, 4, 5, 12, 11, 10, 0
+
+ SWAP 1, 10 ; zero
+%else
+ mova [tmpq+ 1*%%str], m1
+ mova [tmpq+11*%%str], m2
+ mova [tmpq+15*%%str], m3
+ mova [tmpq+17*%%str], m4
+ mova [tmpq+19*%%str], m5
+ pxor m1, m1
+
+ ; store t0-1 and t30-31
+ mova m2, [tmpq+ 0*%%str]
+ mova m3, [tmpq+ 4*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t2-3 and t28-29
+ mova m2, [tmpq+ 8*%%str]
+ mova m3, [tmpq+12*%%str]
+ mova m0, [tmpq+ 3*%%str]
+ mova m6, [tmpq+ 7*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t4-5 and t26-27
+ mova m2, [tmpq+16*%%str]
+ mova m3, [tmpq+20*%%str]
+ mova m0, [tmpq+ 1*%%str]
+ %%STORE_2X2 2, 3, 7, 0, 4, 5, 1
+
+ ; store t6-7 and t24-25
+ mova m2, [tmpq+24*%%str]
+ mova m3, [tmpq+28*%%str]
+ mova m0, [tmpq+17*%%str]
+ mova m6, [tmpq+19*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t8-9 and t22-23
+ mova m2, [tmpq+30*%%str]
+ mova m3, [tmpq+26*%%str]
+ mova m0, [tmpq+25*%%str]
+ mova m6, [tmpq+ 5*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t10-11 and t20-21
+ mova m2, [tmpq+22*%%str]
+ mova m3, [tmpq+18*%%str]
+ mova m0, [tmpq+11*%%str]
+ mova m6, [tmpq+15*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t12-13 and t18-19
+ mova m2, [tmpq+14*%%str]
+ mova m3, [tmpq+10*%%str]
+ mova m6, [tmpq+13*%%str]
+ mova m0, [tmpq+ 9*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t14-17
+ mova m2, [tmpq+ 6*%%str]
+ mova m3, [tmpq+ 2*%%str]
+ mova m6, [tmpq+29*%%str]
+ mova m0, [tmpq+21*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1, 0
+%endif
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
+ movifnidn eobd, dword eobm
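+ ; pick a reduced transform based on eob: per the 32x32 scan order, an eob
+ ; of at most 34 should keep all nonzero coefficients within the top-left
+ ; 8x8 sub-block and an eob of at most 135 within the top-left 16x16 (these
+ ; shortcuts are only taken on ssse3 and higher), so the 1-D passes only
+ ; need to read that many input rows (see the nnzc argument passed to
+ ; VP9_IDCT32_1D below); eob == 1 is handled as dc-only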
+%if cpuflag(ssse3)
+ cmp eobd, 135
+ jg .idctfull
+ cmp eobd, 34
+ jg .idct16x16
+ cmp eobd, 1
+ jg .idct8x8
+%else
+ cmp eobd, 1
+ jg .idctfull
+%endif
+
+ ; dc-only case
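+ ; the dc coefficient is passed through the 1-D transform twice, i.e. two
+ ; rounded Q14 multiplies by 11585 (~16384*cos(pi/4), the VP9 cospi_16
+ ; constant), then rounded and shifted down by 6 for the 32x32 output
+ ; scaling; the ssse3 path does this with pmulhrsw (pw_11585x2 twice, then
+ ; pw_512 at the end), while the plain integer path spells the same
+ ; arithmetic out with imul/add/sar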
+ movifnidn blockq, blockmp
+ movifnidn dstq, dstmp
+ movifnidn strideq, stridemp
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ mova m1, [pw_11585x2]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (32 << 14) + 8192
+ sar coefd, 14 + 6
+ movd m0, coefd
+%endif
+ SPLATW m0, m0, q0000
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_512]
+%endif
+ pxor m5, m5
+ movd [blockq], m5
+%rep 31
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
+ add dstq, strideq
+%endrep
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
+ RET
+
+%if ARCH_X86_64
+ DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+%else
+%define dst_bakq r0mp
+%endif
+%if cpuflag(ssse3)
+.idct8x8:
+%if ARCH_X86_32
+ DEFINE_ARGS block, u1, u2, u3, u4, tmp
+ mov blockq, r2mp
+%endif
+ mov tmpq, rsp
+ VP9_IDCT32_1D blockq, 1, 8
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ sub stride30q, stride2q ; stride*30
+.loop2_8x8:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 8
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_8x8
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
+ ZERO_BLOCK blockq, 64, 8, m1
+ RET
+
+.idct16x16:
+%if ARCH_X86_32
+ DEFINE_ARGS block, tmp, cnt
+ mov blockq, r2mp
+%endif
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_16x16:
+ VP9_IDCT32_1D blockq, 1, 16
+ add blockq, 16
+ add tmpq, 512
+ dec cntd
+ jg .loop1_16x16
+
+%if ARCH_X86_64
+ sub blockq, 32
+%else
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_16x16:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 16
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_16x16
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
+ ZERO_BLOCK blockq, 64, 16, m1
+ RET
+%endif
+
+.idctfull:
+%if ARCH_X86_32
+ DEFINE_ARGS block, tmp, cnt
+ mov blockq, r2mp
+%endif
+ mov cntd, 4
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT32_1D blockq, 1
+ add blockq, 16
+ add tmpq, 512
+ dec cntd
+ jg .loop1_full
+
+%if ARCH_X86_64
+ sub blockq, 64
+%else
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_full:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
+ ZERO_BLOCK blockq, 64, 32, m1
+ RET
+%endmacro
+
+VP9_IDCT_IDCT_32x32_ADD_XMM sse2
+VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
+VP9_IDCT_IDCT_32x32_ADD_XMM avx
+
+; this is almost identical to VP9_STORE_2X, but it does two rows
+; for slightly improved interleaving, and it omits vpermq since the
+; input is DC so all values are identical
+%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
+ mova m%2, [dstq]
+ mova m%4, [dstq+strideq]
+ punpckhbw m%3, m%2, m%6
+ punpcklbw m%2, m%6
+ punpckhbw m%5, m%4, m%6
+ punpcklbw m%4, m%6
+ paddw m%3, m%1
+ paddw m%2, m%1
+ paddw m%5, m%1
+ paddw m%4, m%1
+ packuswb m%2, m%3
+ packuswb m%4, m%5
+ mova [dstq+strideq*0], m%2
+ mova [dstq+strideq*1], m%4
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
+ cmp eobd, 135
+ jg .idctfull
+ cmp eobd, 1
+ jg .idct16x16
+
+ ; dc-only case
+ mova m1, [pw_11585x2]
+ vpbroadcastw m0, [blockq]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ pxor m5, m5
+ pmulhrsw m0, [pw_512]
+ movd [blockq], xm5
+
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 16
+.loop_dc:
+ VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
+ lea dstq, [dstq+2*strideq]
+ dec cntd
+ jg .loop_dc
+ RET
+
+ DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+.idct16x16:
+ mov tmpq, rsp
+ VP9_IDCT32_1D blockq, 1, 16
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 2
+ sub stride30q, stride2q ; stride*30
+.loop2_16x16:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 16
+ add dst_bakq, 16
+ add tmpq, 32
+ dec cntd
+ jg .loop2_16x16
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 64, 16, m1
+ RET
+
+.idctfull:
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT32_1D blockq, 1
+ add blockq, 32
+ add tmpq, 1024
+ dec cntd
+ jg .loop1_full
+
+ sub blockq, 64
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 2
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_full:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2
+ add dst_bakq, 16
+ add tmpq, 32
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 64, 32, m1
+ RET
+%endif
diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
new file mode 100644
index 0000000000..902685edf6
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 inverse transform x86 SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+%include "vp9itxfm_template.asm"
+
+SECTION_RODATA
+
+cextern pw_8
+cextern pw_1023
+cextern pw_2048
+cextern pw_4095
+cextern pw_m1
+cextern pd_1
+cextern pd_16
+cextern pd_32
+cextern pd_8192
+
+pd_8: times 4 dd 8
+pd_3fff: times 4 dd 0x3fff
+
+cextern pw_11585x2
+
+cextern pw_5283_13377
+cextern pw_9929_13377
+cextern pw_15212_m13377
+cextern pw_15212_9929
+cextern pw_m5283_m15212
+cextern pw_13377x2
+cextern pw_m13377_13377
+cextern pw_13377_0
+
+pw_9929_m5283: times 4 dw 9929, -5283
+
+%macro COEF_PAIR 2-3
+cextern pw_m%1_%2
+cextern pw_%2_%1
+%if %0 == 3
+cextern pw_m%1_m%2
+%if %1 != %2
+cextern pw_m%2_%1
+cextern pw_%1_%2
+%endif
+%endif
+%endmacro
+
+COEF_PAIR 2404, 16207
+COEF_PAIR 3196, 16069, 1
+COEF_PAIR 4756, 15679
+COEF_PAIR 5520, 15426
+COEF_PAIR 6270, 15137, 1
+COEF_PAIR 8423, 14053
+COEF_PAIR 10394, 12665
+COEF_PAIR 11003, 12140
+COEF_PAIR 11585, 11585, 1
+COEF_PAIR 13160, 9760
+COEF_PAIR 13623, 9102, 1
+COEF_PAIR 14449, 7723
+COEF_PAIR 14811, 7005
+COEF_PAIR 15893, 3981
+COEF_PAIR 16305, 1606
+COEF_PAIR 16364, 804
+
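+; eob -> column-group count tables, one byte per possible eob value (64, 256
+; and 1024 entries for 8x8, 16x16 and 32x32 respectively). The function
+; bodies below index these with eob-1 to decide how many groups of 4 input
+; columns can contain nonzero coefficients (and thus need a first-pass
+; transform); the remaining groups are simply zero-padded. The row/col/default
+; variants appear to match the scan order of the corresponding transform-type
+; combinations (see the *_FN invocations below).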
+default_8x8:
+times 12 db 1
+times 52 db 2
+row_8x8:
+times 18 db 1
+times 46 db 2
+col_8x8:
+times 6 db 1
+times 58 db 2
+default_16x16:
+times 10 db 1
+times 28 db 2
+times 51 db 3
+times 167 db 4
+row_16x16:
+times 21 db 1
+times 45 db 2
+times 60 db 3
+times 130 db 4
+col_16x16:
+times 5 db 1
+times 12 db 2
+times 25 db 3
+times 214 db 4
+default_32x32:
+times 9 db 1
+times 25 db 2
+times 36 db 3
+times 65 db 4
+times 105 db 5
+times 96 db 6
+times 112 db 7
+times 576 db 8
+
+SECTION .text
+
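+; add two rows of residuals (in reg1/reg2) to the pixels at dst/dst+stride
+; and clamp the result to the [min, max] range of the current bit depth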
+%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
+ mova m%3, [%7]
+ mova m%4, [%7+strideq]
+ paddw m%3, m%1
+ paddw m%4, m%2
+ pmaxsw m%3, m%5
+ pmaxsw m%4, m%5
+ pminsw m%3, m%6
+ pminsw m%4, m%6
+ mova [%7], m%3
+ mova [%7+strideq], m%4
+%endmacro
+
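+; clear an nnzcpl x nnzcpl block of dword coefficients stored with the given
+; row stride (in bytes), using the caller-provided zero register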
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*4/mmsize
+ mova [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+; the input coefficients are scaled up by 2 bits (which we downscale again
+; immediately in the iwht), and otherwise grow by 1 bit per (orthonormal)
+; iwht_1d pass. Therefore, a diff of 10-12+sign bits will fit in 12-14+sign
+; bits after scaling, i.e. everything can be done in 15+sign-bit words. Since
+; the quant fractional bits add 2 bits, in 12bpp we need to scale before
+; converting to words (the input is 16+sign bits, which doesn't fit in 15+sign
+; words), but in 10bpp we can scale after converting to words (which is half
+; the instructions), since the input is only 14+sign bits, which fits in
+; 15+sign words directly.
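+; e.g. for 12bpp: a 12+sign-bit diff plus 2 bits of coefficient scaling plus
+; 2 quant fractional bits gives 16+sign bits, hence the psrad before the
+; packssdw below; for 10bpp the same sum is only 14+sign bits, so we can
+; packssdw first and psraw by 2 afterwards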
+
+%macro IWHT4_FN 2 ; bpp, max
+cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
+ mova m7, [pw_%2]
+ mova m0, [blockq+0*16+0]
+ mova m1, [blockq+1*16+0]
+%if %1 >= 12
+ mova m4, [blockq+0*16+8]
+ mova m5, [blockq+1*16+8]
+ psrad m0, 2
+ psrad m1, 2
+ psrad m4, 2
+ psrad m5, 2
+ packssdw m0, m4
+ packssdw m1, m5
+%else
+ packssdw m0, [blockq+0*16+8]
+ packssdw m1, [blockq+1*16+8]
+ psraw m0, 2
+ psraw m1, 2
+%endif
+ mova m2, [blockq+2*16+0]
+ mova m3, [blockq+3*16+0]
+%if %1 >= 12
+ mova m4, [blockq+2*16+8]
+ mova m5, [blockq+3*16+8]
+ psrad m2, 2
+ psrad m3, 2
+ psrad m4, 2
+ psrad m5, 2
+ packssdw m2, m4
+ packssdw m3, m5
+%else
+ packssdw m2, [blockq+2*16+8]
+ packssdw m3, [blockq+3*16+8]
+ psraw m2, 2
+ psraw m3, 2
+%endif
+
+ VP9_IWHT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IWHT4_1D
+
+ pxor m6, m6
+ VP9_STORE_2X 0, 1, 4, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ VP9_STORE_2X 2, 3, 4, 5, 6, 7
+ ZERO_BLOCK blockq, 16, 4, m6
+ RET
+%endmacro
+
+INIT_MMX mmxext
+IWHT4_FN 10, 1023
+INIT_MMX mmxext
+IWHT4_FN 12, 4095
+
+%macro VP9_IDCT4_WRITEOUT 0
+%if cpuflag(ssse3)
+ mova m5, [pw_2048]
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+%else
+ mova m5, [pw_8]
+ paddw m0, m5
+ paddw m1, m5
+ paddw m2, m5
+ paddw m3, m5
+ psraw m0, 4
+ psraw m1, 4
+ psraw m2, 4
+ psraw m3, 4
+%endif
+ mova m5, [pw_1023]
+ VP9_STORE_2X 0, 1, 6, 7, 4, 5
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 2, 3, 6, 7, 4, 5
+%endmacro
+
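+; compute the dc-only output value: two rounded Q14 multiplies of the dc
+; coefficient by 11585 (one per 1-D pass), followed by a round and shift by
+; %1; also clears the dc coefficient in the block (replacing it with %2)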
+%macro DC_ONLY 2 ; shift, zero
+ mov coefd, dword [blockq]
+ movd [blockq], %2
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, ((1 << (%1 - 1)) << 14) + 8192
+ sar coefd, 14 + %1
+%endmacro
+
+; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
+; in 15+1 words without additional effort, since the coefficients are 15bpp.
+
+%macro IDCT4_10_FN 0
+cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only
+ pxor m4, m4
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ movd [blockq], m4
+ mova m5, [pw_11585x2]
+ pmulhrsw m0, m5
+ pmulhrsw m0, m5
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ DC_ONLY 4, m4
+ movd m0, coefd
+%endif
+ pshufw m0, m0, 0
+ mova m5, [pw_1023]
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+ VP9_STORE_2X 0, 0, 6, 7, 4, 5
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4, 5
+ RET
+
+.idctfull:
+ mova m0, [blockq+0*16+0]
+ mova m1, [blockq+1*16+0]
+ packssdw m0, [blockq+0*16+8]
+ packssdw m1, [blockq+1*16+8]
+ mova m2, [blockq+2*16+0]
+ mova m3, [blockq+3*16+0]
+ packssdw m2, [blockq+2*16+8]
+ packssdw m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+ mova m7, [pd_8192] ; rounding
+ VP9_IDCT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IDCT4_1D
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+INIT_MMX mmxext
+IDCT4_10_FN
+INIT_MMX ssse3
+IDCT4_10_FN
+
+%macro IADST4_FN 4
+cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+%endif
+ movdqa xmm5, [pd_8192]
+ mova m0, [blockq+0*16+0]
+ mova m1, [blockq+1*16+0]
+ packssdw m0, [blockq+0*16+8]
+ packssdw m1, [blockq+1*16+8]
+ mova m2, [blockq+2*16+0]
+ mova m3, [blockq+3*16+0]
+ packssdw m2, [blockq+2*16+8]
+ packssdw m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+ movdq2q m7, xmm5
+%endif
+ VP9_%2_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_%4_1D
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+INIT_MMX sse2
+IADST4_FN idct, IDCT4, iadst, IADST4
+IADST4_FN iadst, IADST4, idct, IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+INIT_MMX ssse3
+IADST4_FN idct, IDCT4, iadst, IADST4
+IADST4_FN iadst, IADST4, idct, IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+; inputs and outputs are dwords, coefficients are words
+;
+; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
+; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
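+; to avoid a full 32-bit multiply, each dword source is split into its low
+; 14 bits (masked) and the remaining high bits (psrad by 14), both packed to
+; words and multiplied with pmaddwd against the coefficient pairs; the high
+; bits contribute an exact multiple of 2^14 to the full product, so only the
+; low-part product needs the +rnd and >>14 before the two halves are summed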
+%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
+ pand m%3, m%1, %8
+ pand m%4, m%2, %8
+ psrad m%1, 14
+ psrad m%2, 14
+ packssdw m%4, m%2
+ packssdw m%3, m%1
+ punpckhwd m%2, m%4, m%3
+ punpcklwd m%4, m%3
+ pmaddwd m%3, m%4, [pw_%6_%5]
+ pmaddwd m%1, m%2, [pw_%6_%5]
+ pmaddwd m%4, [pw_m%5_%6]
+ pmaddwd m%2, [pw_m%5_%6]
+ paddd m%3, %7
+ paddd m%4, %7
+ psrad m%3, 14
+ psrad m%4, 14
+ paddd m%1, m%3
+ paddd m%2, m%4
+%endmacro
+
+%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
+ SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2
+ SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2
+ SUMSUB_BA d, %4, %3, %7
+ SUMSUB_BA d, %6, %5, %7
+ SWAP %4, %6, %3
+%endmacro
+
+%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
+ movh m%1, [dstq+strideq*0]
+ movh m%2, [dstq+strideq*2]
+ movhps m%1, [dstq+strideq*1]
+ movhps m%2, [dstq+stride3q ]
+ paddw m%1, m%3
+ paddw m%2, m%4
+ pmaxsw m%1, %5
+ pmaxsw m%2, %5
+ pminsw m%1, %6
+ pminsw m%2, %6
+ movh [dstq+strideq*0], m%1
+ movhps [dstq+strideq*1], m%1
+ movh [dstq+strideq*2], m%2
+ movhps [dstq+stride3q ], m%2
+%endmacro
+
+%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift
+ paddd m%1, %7
+ paddd m%2, %7
+ paddd m%3, %7
+ paddd m%4, %7
+ psrad m%1, %8
+ psrad m%2, %8
+ psrad m%3, %8
+ psrad m%4, %8
+ packssdw m%1, m%2
+ packssdw m%3, m%4
+ STORE_4x4 %2, %4, %1, %3, %5, %6
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
+ ; 17+sign bits. Since the multiply is with 11585, which is a 14-bit constant, the
+ ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a
+ ; dword. After the final shift (4), the result is 13+sign bits, so we
+ ; don't need any additional processing to fit it in a word
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m4, m4
+ DC_ONLY 4, m4
+ movd m0, coefd
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova m5, [pw_4095]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ STORE_4x4 1, 3, 0, 0, m4, m5
+ RET
+
+.idctfull:
+ DEFINE_ARGS dst, stride, block, eob
+ mova m0, [blockq+0*16]
+ mova m1, [blockq+1*16]
+ mova m2, [blockq+2*16]
+ mova m3, [blockq+3*16]
+ mova m6, [pd_8192]
+ mova m7, [pd_3fff]
+
+ IDCT4_12BPP_1D m6, m7
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+ IDCT4_12BPP_1D m6, m7
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+
+ ; writeout
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova m5, [pw_4095]
+ mova m6, [pd_8]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
+ RET
+
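+; SCRATCH/UNSCRATCH spill/reload a register to/from one of the high xmm
+; registers on x86-64 (via SWAP) or to/from the given stack slot on x86-32;
+; PRELOAD similarly caches a memory operand in a high register on x86-64,
+; and the optional name argument defines reg_<name> so callers can refer to
+; either form (register or memory) transparently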
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+ mova [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+ mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; out0 = 5283 * in0 + 13377 * in1 + 15212 * in2 + 9929 * in3 + rnd >> 14
+; out1 = 9929 * in0 + 13377 * in1 - 5283 * in2 - 15212 * in3 + rnd >> 14
+; out2 = 13377 * in0 - 13377 * in2 + 13377 * in3 + rnd >> 14
+; out3 = 15212 * in0 - 13377 * in1 + 9929 * in2 - 5283 * in3 + rnd >> 14
+%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
+ pand m4, m0, %2
+ pand m5, m1, %2
+ psrad m0, 14
+ psrad m1, 14
+ packssdw m5, m1
+ packssdw m4, m0
+ punpckhwd m1, m4, m5
+ punpcklwd m4, m5
+ pand m5, m2, %2
+ pand m6, m3, %2
+ psrad m2, 14
+ psrad m3, 14
+ packssdw m6, m3
+ packssdw m5, m2
+ punpckhwd m3, m5, m6
+ punpcklwd m5, m6
+ SCRATCH 1, 8, rsp+0*mmsize, a
+ SCRATCH 5, 9, rsp+1*mmsize, b
+
+ ; m1/3 have the high bits of 0,1,2,3
+ ; m4/5 have the low bits of 0,1,2,3
+ ; m0/2/6/7 are free
+
+ mova m2, [pw_15212_9929]
+ mova m0, [pw_5283_13377]
+ pmaddwd m7, m2, reg_b
+ pmaddwd m6, m4, m0
+ pmaddwd m2, m3
+ pmaddwd m0, reg_a
+ paddd m6, m7
+ paddd m0, m2
+ mova m1, [pw_m13377_13377]
+ mova m5, [pw_13377_0]
+ pmaddwd m7, m1, reg_b
+ pmaddwd m2, m4, m5
+ pmaddwd m1, m3
+ pmaddwd m5, reg_a
+ paddd m2, m7
+ paddd m1, m5
+ paddd m6, %1
+ paddd m2, %1
+ psrad m6, 14
+ psrad m2, 14
+ paddd m0, m6 ; t0
+ paddd m2, m1 ; t2
+
+ mova m7, [pw_m5283_m15212]
+ mova m5, [pw_9929_13377]
+ pmaddwd m1, m7, reg_b
+ pmaddwd m6, m4, m5
+ pmaddwd m7, m3
+ pmaddwd m5, reg_a
+ paddd m6, m1
+ paddd m7, m5
+ UNSCRATCH 5, 9, rsp+1*mmsize, b
+ pmaddwd m5, [pw_9929_m5283]
+ pmaddwd m4, [pw_15212_m13377]
+ pmaddwd m3, [pw_9929_m5283]
+ UNSCRATCH 1, 8, rsp+0*mmsize, a
+ pmaddwd m1, [pw_15212_m13377]
+ paddd m4, m5
+ paddd m3, m1
+ paddd m6, %1
+ paddd m4, %1
+ psrad m6, 14
+ psrad m4, 14
+ paddd m7, m6 ; t1
+ paddd m3, m4 ; t3
+
+ SWAP 1, 7
+%endmacro
+
+%macro IADST4_12BPP_FN 4
+cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
+ mova m0, [blockq+0*16]
+ mova m1, [blockq+1*16]
+ mova m2, [blockq+2*16]
+ mova m3, [blockq+3*16]
+
+ PRELOAD 10, pd_8192, rnd
+ PRELOAD 11, pd_3fff, mask
+ %2_12BPP_1D reg_rnd, reg_mask
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+ %4_12BPP_1D reg_rnd, reg_mask
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+
+ ; writeout
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova m5, [pw_4095]
+ mova m6, [pd_8]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
+ RET
+%endmacro
+
+INIT_XMM sse2
+IADST4_12BPP_FN idct, IDCT4, iadst, IADST4
+IADST4_12BPP_FN iadst, IADST4, idct, IDCT4
+IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
+
+; the following line has not been executed at the end of this macro:
+; UNSCRATCH 6, 8, rsp+(%5+0)*mmsize
+%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
+ mova m0, [%1+0*%4]
+ mova m2, [%1+2*%4]
+ mova m4, [%1+4*%4]
+ mova m6, [%1+6*%4]
+ IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
+ SCRATCH 4, 8, rsp+(%5+0)*mmsize
+ SCRATCH 6, 9, rsp+(%5+1)*mmsize
+ mova m1, [%1+1*%4]
+ mova m3, [%1+3*%4]
+ mova m5, [%1+5*%4]
+ mova m7, [%1+7*%4]
+ SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a
+ SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a
+ SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a
+ SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a
+ SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5
+ SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7
+ SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6
+ UNSCRATCH 4, 8, rsp+(%5+0)*mmsize
+ UNSCRATCH 6, 9, rsp+(%5+1)*mmsize
+ SCRATCH 2, 8, rsp+(%5+0)*mmsize
+ SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5
+ SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4
+ SWAP 0, 5, 4, 6, 2, 7
+%endmacro
+
+%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max
+ mova m%1, [%6+%7*0]
+ mova m%2, [%6+%7*1]
+ paddw m%1, m%3
+ paddw m%2, m%3
+ pmaxsw m%1, %4
+ pmaxsw m%2, %4
+ pminsw m%1, %5
+ pminsw m%2, %5
+ mova [%6+%7*0], m%1
+ mova [%6+%7*1], m%2
+%endmacro
+
+; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp
+; storage also instead of allocating two more stack spaces. This doesn't
+; matter much but it's something...
+INIT_XMM sse2
+cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
+ 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+ ; coef values are 16+sign bits, and the multiplier constant is 14 bits, so 30+sign easily
+ ; fits in 32bit
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m2, m2
+ DC_ONLY 5, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 4
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+.idctfull:
+ SCRATCH 0, 12, rsp+16*mmsize, max
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [default_8x8]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_8x8+cntq-1]
+%endif
+ mov skipd, 2
+ sub skipd, cntd
+ mov ptrq, rsp
+ PRELOAD 10, pd_8192, rnd
+ PRELOAD 11, pd_3fff, mask
+ PRELOAD 13, pd_16, srnd
+.loop_1:
+ IDCT8_1D blockq, reg_rnd, reg_mask
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 6
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 2*mmsize], m1
+ mova [ptrq+ 4*mmsize], m2
+ mova [ptrq+ 6*mmsize], m3
+ UNSCRATCH 6, 8, rsp+17*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 1*mmsize], m4
+ mova [ptrq+ 3*mmsize], m5
+ mova [ptrq+ 5*mmsize], m6
+ mova [ptrq+ 7*mmsize], m7
+ add ptrq, 8 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ add ptrq, 4 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 2
+ mov ptrq, rsp
+.loop_2:
+ IDCT8_1D ptrq, reg_rnd, reg_mask
+
+ pxor m6, m6
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
+ lea dstq, [dstq+strideq*4]
+ UNSCRATCH 0, 8, rsp+17*mmsize
+ UNSCRATCH 1, 12, rsp+16*mmsize, max
+ UNSCRATCH 2, 13, pd_16, srnd
+ ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
+ add ptrq, 16
+%if ARCH_X86_64
+ lea dstq, [dstbakq+8]
+%else
+ mov dstq, dstm
+ add dstq, 8
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m6 is still zero
+ ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
+ RET
+
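+; on x86-32 the coefficient is too wide to be multiplied by 11585 in 32 bits,
+; so each of the two Q14 multiplies is split: the low 14 bits and the high
+; bits of the running value are multiplied separately, the low product is
+; rounded and shifted down by 14, and the two halves are added - giving the
+; same result as (x * 11585 + 8192) >> 14 without needing a 64-bit product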
+%macro DC_ONLY_64BIT 2 ; shift, zero
+%if ARCH_X86_64
+ movsxd coefq, dword [blockq]
+ movd [blockq], %2
+ imul coefq, 11585
+ add coefq, 8192
+ sar coefq, 14
+ imul coefq, 11585
+ add coefq, ((1 << (%1 - 1)) << 14) + 8192
+ sar coefq, 14 + %1
+%else
+ mov coefd, dword [blockq]
+ movd [blockq], %2
+ DEFINE_ARGS dst, stride, cnt, coef, coefl
+ mov cntd, 2
+.loop_dc_calc:
+ mov coefld, coefd
+ sar coefd, 14
+ and coefld, 0x3fff
+ imul coefd, 11585
+ imul coefld, 11585
+ add coefld, 8192
+ sar coefld, 14
+ add coefd, coefld
+ dec cntd
+ jg .loop_dc_calc
+ add coefd, 1 << (%1 - 1)
+ sar coefd, %1
+%endif
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
+ 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ cmp eobd, 1
+ jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull
+
+ ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
+ ; bits, and 18+14+sign does not fit in 32 bits, so we do 2-stage multiplies
+ DEFINE_ARGS dst, stride, block, coef, coefl
+ pxor m2, m2
+ DC_ONLY_64BIT 5, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 4
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+; inputs and outputs are dwords, coefficients are words
+;
+; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
+; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
+%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
+ pand m%3, m%1, %7
+ pand m%4, m%2, %7
+ psrad m%1, 14
+ psrad m%2, 14
+ packssdw m%4, m%2
+ packssdw m%3, m%1
+ punpckhwd m%2, m%4, m%3
+ punpcklwd m%4, m%3
+ pmaddwd m%3, m%4, [pw_%6_%5]
+ pmaddwd m%1, m%2, [pw_%6_%5]
+ pmaddwd m%4, [pw_m%5_%6]
+ pmaddwd m%2, [pw_m%5_%6]
+%endmacro
+
+; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
+; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
+%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
+ SUMSUB_BA d, %1, %2, %5
+ SUMSUB_BA d, %3, %4, %5
+ paddd m%3, %6
+ paddd m%4, %6
+ psrad m%3, 14
+ psrad m%4, 14
+ paddd m%1, m%3
+ paddd m%2, m%4
+%endmacro
+
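+; negate a register of packed dwords: psignd by -1 where ssse3 is available,
+; otherwise two's complement via pxor with all-ones plus paddd by 1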
+%macro NEGD 1
+%if cpuflag(ssse3)
+ psignd %1, [pw_m1]
+%else
+ pxor %1, [pw_m1]
+ paddd %1, [pd_1]
+%endif
+%endmacro
+
+; the following line has not been executed at the end of this macro:
+; UNSCRATCH 6, 8, rsp+17*mmsize
+%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
+ mova m0, [%1+ 0*mmsize]
+ mova m3, [%1+ 6*mmsize]
+ mova m4, [%1+ 8*mmsize]
+ mova m7, [%1+14*mmsize]
+ SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a
+ SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a
+ SCRATCH 0, 8, rsp+17*mmsize
+ SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4
+ UNSCRATCH 0, 8, rsp+17*mmsize
+ SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5
+
+ SCRATCH 3, 8, rsp+17*mmsize
+ SCRATCH 4, 9, rsp+18*mmsize
+ SCRATCH 7, 10, rsp+19*mmsize
+ SCRATCH 0, 11, rsp+20*mmsize
+
+ mova m1, [%1+ 2*mmsize]
+ mova m2, [%1+ 4*mmsize]
+ mova m5, [%1+10*mmsize]
+ mova m6, [%1+12*mmsize]
+ SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a
+ SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a
+ SCRATCH 2, 12, rsp+21*mmsize
+ SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6
+ UNSCRATCH 2, 12, rsp+21*mmsize
+ SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7
+
+ UNSCRATCH 7, 10, rsp+19*mmsize
+ UNSCRATCH 0, 11, rsp+20*mmsize
+ SCRATCH 1, 10, rsp+19*mmsize
+ SCRATCH 6, 11, rsp+20*mmsize
+
+ SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a
+ SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a
+ SCRATCH 2, 12, rsp+21*mmsize
+ SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6
+ UNSCRATCH 2, 12, rsp+21*mmsize
+ NEGD m5 ; m5=out1
+ SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7
+ SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5
+ NEGD m0 ; m0=out5
+
+ UNSCRATCH 3, 8, rsp+17*mmsize
+ UNSCRATCH 4, 9, rsp+18*mmsize
+ UNSCRATCH 1, 10, rsp+19*mmsize
+ UNSCRATCH 6, 11, rsp+20*mmsize
+ SCRATCH 2, 8, rsp+17*mmsize
+ SCRATCH 0, 9, rsp+18*mmsize
+
+ SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2
+ SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3
+ NEGD m6 ; m6=out7
+ SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
+ NEGD m3 ; m3=out3
+
+ UNSCRATCH 0, 9, rsp+18*mmsize
+
+ SWAP 0, 1, 5
+ SWAP 2, 7, 6
+%endmacro
+
+%macro IADST8_FN 5
+cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+
+.body:
+ SCRATCH 0, 13, rsp+16*mmsize, max
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [%5_8x8]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [%5_8x8+cntq-1]
+%endif
+ mov skipd, 2
+ sub skipd, cntd
+ mov ptrq, rsp
+ PRELOAD 14, pd_8192, rnd
+ PRELOAD 15, pd_3fff, mask
+.loop_1:
+ %2_1D blockq, reg_rnd, reg_mask
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 6
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 2*mmsize], m1
+ mova [ptrq+ 4*mmsize], m2
+ mova [ptrq+ 6*mmsize], m3
+ UNSCRATCH 6, 8, rsp+17*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 1*mmsize], m4
+ mova [ptrq+ 3*mmsize], m5
+ mova [ptrq+ 5*mmsize], m6
+ mova [ptrq+ 7*mmsize], m7
+ add ptrq, 8 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ add ptrq, 4 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 2
+ mov ptrq, rsp
+.loop_2:
+ %4_1D ptrq, reg_rnd, reg_mask
+
+ pxor m6, m6
+ PRELOAD 9, pd_16, srnd
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
+ lea dstq, [dstq+strideq*4]
+ UNSCRATCH 0, 8, rsp+17*mmsize
+ UNSCRATCH 1, 13, rsp+16*mmsize, max
+ UNSCRATCH 2, 9, pd_16, srnd
+ ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
+ add ptrq, 16
+%if ARCH_X86_64
+ lea dstq, [dstbakq+8]
+%else
+ mov dstq, dstm
+ add dstq, 8
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m6 is still zero
+ ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
+ RET
+
+cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+IADST8_FN idct, IDCT8, iadst, IADST8, row
+IADST8_FN iadst, IADST8, idct, IDCT8, col
+IADST8_FN iadst, IADST8, iadst, IADST8, default
+
+%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
+ IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
+ ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6
+ SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a
+ SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a
+ SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a
+ SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a
+ SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4
+ mova [rsp+(%3+0)*mmsize], m5 ; t5
+ mova [rsp+(%3+1)*mmsize], m7 ; t7
+
+ mova m0, [%1+ 1*%2] ; in1
+ mova m3, [%1+ 7*%2] ; in7
+ mova m4, [%1+ 9*%2] ; in9
+ mova m7, [%1+15*%2] ; in15
+
+ SUMSUB_MUL 0, 7, 1, 2, 16305, 1606 ; m0=t15a, m7=t8a
+ SUMSUB_MUL 4, 3, 1, 2, 10394, 12665 ; m4=t14a, m3=t9a
+ SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9
+ SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14
+ SUMSUB_MUL 0, 7, 1, 2, 15137, 6270 ; m0=t14a, m7=t9a
+
+ mova m1, [%1+ 3*%2] ; in3
+ mova m2, [%1+ 5*%2] ; in5
+ mova m5, [%1+11*%2] ; in11
+ mova m6, [%1+13*%2] ; in13
+
+ SCRATCH 0, 9, rsp+(%4+1)*mmsize
+ SCRATCH 7, 10, rsp+(%4+2)*mmsize
+
+ SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a
+ SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a
+ SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10
+ SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13
+ NEGD m1 ; m1=-t10
+ SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a
+
+ UNSCRATCH 7, 10, rsp+(%4+2)*mmsize
+ SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a
+ SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10
+ SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a
+ SCRATCH 5, 10, rsp+(%4+2)*mmsize
+ SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11
+ UNSCRATCH 0, 9, rsp+(%4+1)*mmsize
+ SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13
+ SCRATCH 6, 9, rsp+(%4+1)*mmsize
+ SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a
+
+ ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
+ ; free: 6,5
+
+ UNSCRATCH 5, 15, rsp+(%4+7)*mmsize
+ SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15
+ SCRATCH 5, 15, rsp+(%4+7)*mmsize
+ UNSCRATCH 5, 14, rsp+(%4+6)*mmsize
+ SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14
+ SCRATCH 5, 14, rsp+(%4+6)*mmsize
+ UNSCRATCH 5, 13, rsp+(%4+5)*mmsize
+ SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13
+ SCRATCH 5, 13, rsp+(%4+5)*mmsize
+ UNSCRATCH 5, 12, rsp+(%4+4)*mmsize
+ SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12
+ SCRATCH 5, 12, rsp+(%4+4)*mmsize
+ UNSCRATCH 5, 11, rsp+(%4+3)*mmsize
+ SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11
+ SCRATCH 4, 11, rsp+(%4+3)*mmsize
+ mova m4, [rsp+(%3+0)*mmsize]
+ SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10
+ mova [rsp+(%3+0)*mmsize], m5
+ UNSCRATCH 5, 8, rsp+(%4+0)*mmsize
+ UNSCRATCH 6, 9, rsp+(%4+1)*mmsize
+ SCRATCH 2, 8, rsp+(%4+0)*mmsize
+ SCRATCH 1, 9, rsp+(%4+1)*mmsize
+ UNSCRATCH 1, 10, rsp+(%4+2)*mmsize
+ SCRATCH 0, 10, rsp+(%4+2)*mmsize
+ mova m0, [rsp+(%3+1)*mmsize]
+ SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9
+ SUMSUB_BA d, 1, 0, 2 ; m1=out7, m0=out8
+
+ SWAP 0, 3, 1, 7, 2, 6, 4
+
+ ; output order: 8-11|r67-70=out0-3
+ ; 0-6,r65=out4-11
+ ; 12-15|r71-74=out12-15
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+ ; coef value is 17 bits + sign and the multiplier constant is 14 bits, so
+ ; 31+sign easily fits in 32bit
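+ ; (worst case: 2^17 * (2^14 - 1) = 2^31 - 2^17, comfortably below the
+ ; signed 32-bit limit)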
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m2, m2
+ DC_ONLY 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+.idctfull:
+ mova [rsp+64*mmsize], m0
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [default_16x16]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_16x16+cntq-1]
+%endif
+ mov skipd, 4
+ sub skipd, cntd
+ mov ptrq, rsp
+.loop_1:
+ IDCT16_1D blockq
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 1*mmsize], m0
+ mova [ptrq+ 5*mmsize], m1
+ mova [ptrq+ 9*mmsize], m2
+ mova [ptrq+13*mmsize], m3
+ mova m7, [rsp+65*mmsize]
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 2*mmsize], m4
+ mova [ptrq+ 6*mmsize], m5
+ mova [ptrq+10*mmsize], m6
+ mova [ptrq+14*mmsize], m7
+ UNSCRATCH 0, 8, rsp+67*mmsize
+ UNSCRATCH 1, 9, rsp+68*mmsize
+ UNSCRATCH 2, 10, rsp+69*mmsize
+ UNSCRATCH 3, 11, rsp+70*mmsize
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 4*mmsize], m1
+ mova [ptrq+ 8*mmsize], m2
+ mova [ptrq+12*mmsize], m3
+ UNSCRATCH 4, 12, rsp+71*mmsize
+ UNSCRATCH 5, 13, rsp+72*mmsize
+ UNSCRATCH 6, 14, rsp+73*mmsize
+ UNSCRATCH 7, 15, rsp+74*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 3*mmsize], m4
+ mova [ptrq+ 7*mmsize], m5
+ mova [ptrq+11*mmsize], m6
+ mova [ptrq+15*mmsize], m7
+ add ptrq, 16 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ mov ptrq, rsp
+.loop_2:
+ IDCT16_1D ptrq
+
+ pxor m7, m7
+ lea dstq, [dstq+strideq*4]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+ lea dstq, [dstq+strideq*4]
+ mova m0, [rsp+65*mmsize]
+ mova m1, [rsp+64*mmsize]
+ mova m2, [pd_32]
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+%if ARCH_X86_64
+ DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else
+ mov dstq, dstm
+%endif
+ UNSCRATCH 0, 8, rsp+67*mmsize
+ UNSCRATCH 4, 9, rsp+68*mmsize
+ UNSCRATCH 5, 10, rsp+69*mmsize
+ UNSCRATCH 3, 11, rsp+70*mmsize
+ ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea dstq, [dstbakq+stride3q*4]
+%else
+ lea dstq, [dstq+stride3q*4]
+%endif
+ UNSCRATCH 4, 12, rsp+71*mmsize
+ UNSCRATCH 5, 13, rsp+72*mmsize
+ UNSCRATCH 6, 14, rsp+73*mmsize
+ UNSCRATCH 0, 15, rsp+74*mmsize
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+ add ptrq, mmsize
+%if ARCH_X86_64
+ add dstbakq, 8
+ mov dstq, dstbakq
+%else
+ add dword dstm, 8
+ mov dstq, dstm
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m7 is still zero
+ ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
+ RET
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ cmp eobd, 1
+ jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull
+
+ ; dc-only - unfortunately, this one can overflow, since 12 bpp coefs are
+ ; 19 bits + sign, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
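+ ; (2^19 * 2^14 = 2^33, so a single 32-bit product would already overflow)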
+ DEFINE_ARGS dst, stride, block, coef, coefl
+ pxor m2, m2
+ DC_ONLY_64BIT 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+; r65-69 are available for spills
+; r70-77 are available on x86-32 only (x86-64 should use m8-15)
+; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
+%macro IADST16_1D 1 ; src
+ mova m0, [%1+ 0*4*mmsize] ; in0
+ mova m1, [%1+ 7*4*mmsize] ; in7
+ mova m2, [%1+ 8*4*mmsize] ; in8
+ mova m3, [%1+15*4*mmsize] ; in15
+ SUMSUB_MUL_D 3, 0, 4, 5, 16364, 804 ; m3/4=t0, m0/5=t1
+ SUMSUB_MUL_D 1, 2, 6, 7, 11003, 12140 ; m1/6=t8, m2/7=t9
+ SCRATCH 0, 8, rsp+70*mmsize
+ SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t0a, m3=t8a
+ UNSCRATCH 0, 8, rsp+70*mmsize
+ SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t1a, m0=t9a
+ mova [rsp+67*mmsize], m1
+ SCRATCH 2, 9, rsp+71*mmsize
+ SCRATCH 3, 12, rsp+74*mmsize
+ SCRATCH 0, 13, rsp+75*mmsize
+
+ mova m0, [%1+ 3*4*mmsize] ; in3
+ mova m1, [%1+ 4*4*mmsize] ; in4
+ mova m2, [%1+11*4*mmsize] ; in11
+ mova m3, [%1+12*4*mmsize] ; in12
+ SUMSUB_MUL_D 2, 1, 4, 5, 14811, 7005 ; m2/4=t4, m1/5=t5
+ SUMSUB_MUL_D 0, 3, 6, 7, 5520, 15426 ; m0/6=t12, m3/7=t13
+ SCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t4a, m2=t12a
+ UNSCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t5a, m1=t13a
+ SCRATCH 0, 15, rsp+77*mmsize
+ SCRATCH 3, 11, rsp+73*mmsize
+
+ UNSCRATCH 0, 12, rsp+74*mmsize ; t8a
+ UNSCRATCH 3, 13, rsp+75*mmsize ; t9a
+ SUMSUB_MUL_D 0, 3, 4, 5, 16069, 3196 ; m0/4=t8, m3/5=t9
+ SUMSUB_MUL_D 1, 2, 6, 7, 3196, 16069 ; m1/6=t13, m2/7=t12
+ SCRATCH 1, 12, rsp+74*mmsize
+ SUMSUB_PACK_D 2, 0, 7, 4, 1 ; m2=t8a, m0=t12a
+ UNSCRATCH 1, 12, rsp+74*mmsize
+ SUMSUB_PACK_D 1, 3, 6, 5, 4 ; m1=t9a, m3=t13a
+ mova [rsp+65*mmsize], m2
+ mova [rsp+66*mmsize], m1
+ SCRATCH 0, 8, rsp+70*mmsize
+ SCRATCH 3, 12, rsp+74*mmsize
+
+ mova m0, [%1+ 2*4*mmsize] ; in2
+ mova m1, [%1+ 5*4*mmsize] ; in5
+ mova m2, [%1+10*4*mmsize] ; in10
+ mova m3, [%1+13*4*mmsize] ; in13
+ SUMSUB_MUL_D 3, 0, 4, 5, 15893, 3981 ; m3/4=t2, m0/5=t3
+ SUMSUB_MUL_D 1, 2, 6, 7, 8423, 14053 ; m1/6=t10, m2/7=t11
+ SCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t2a, m3=t10a
+ UNSCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t3a, m0=t11a
+ mova [rsp+68*mmsize], m1
+ mova [rsp+69*mmsize], m2
+ SCRATCH 3, 13, rsp+75*mmsize
+ SCRATCH 0, 14, rsp+76*mmsize
+
+ mova m0, [%1+ 1*4*mmsize] ; in1
+ mova m1, [%1+ 6*4*mmsize] ; in6
+ mova m2, [%1+ 9*4*mmsize] ; in9
+ mova m3, [%1+14*4*mmsize] ; in14
+ SUMSUB_MUL_D 2, 1, 4, 5, 13160, 9760 ; m2/4=t6, m1/5=t7
+ SUMSUB_MUL_D 0, 3, 6, 7, 2404, 16207 ; m0/6=t14, m3/7=t15
+ SCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t6a, m2=t14a
+ UNSCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t7a, m1=t15a
+
+ UNSCRATCH 4, 13, rsp+75*mmsize ; t10a
+ UNSCRATCH 5, 14, rsp+76*mmsize ; t11a
+ SCRATCH 0, 13, rsp+75*mmsize
+ SCRATCH 3, 14, rsp+76*mmsize
+ SUMSUB_MUL_D 4, 5, 6, 7, 9102, 13623 ; m4/6=t10, m5/7=t11
+ SUMSUB_MUL_D 1, 2, 0, 3, 13623, 9102 ; m1/0=t15, m2/3=t14
+ SCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 2, 4, 3, 6, 0 ; m2=t10a, m4=t14a
+ UNSCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 1, 5, 0, 7, 6 ; m1=t11a, m5=t15a
+
+ UNSCRATCH 0, 8, rsp+70*mmsize ; t12a
+ UNSCRATCH 3, 12, rsp+74*mmsize ; t13a
+ SCRATCH 2, 8, rsp+70*mmsize
+ SCRATCH 1, 12, rsp+74*mmsize
+ SUMSUB_MUL_D 0, 3, 1, 2, 15137, 6270 ; m0/1=t12, m3/2=t13
+ SUMSUB_MUL_D 5, 4, 7, 6, 6270, 15137 ; m5/7=t15, m4/6=t14
+ SCRATCH 2, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 4, 0, 6, 1, 2 ; m4=out2, m0=t14a
+ UNSCRATCH 2, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 5, 3, 7, 2, 1 ; m5=-out13, m3=t15a
+ NEGD m5 ; m5=out13
+
+ UNSCRATCH 1, 9, rsp+71*mmsize ; t1a
+ mova m2, [rsp+68*mmsize] ; t2a
+ UNSCRATCH 6, 13, rsp+75*mmsize ; t6a
+ UNSCRATCH 7, 14, rsp+76*mmsize ; t7a
+ SCRATCH 4, 10, rsp+72*mmsize
+ SCRATCH 5, 13, rsp+75*mmsize
+ UNSCRATCH 4, 15, rsp+77*mmsize ; t4a
+ UNSCRATCH 5, 11, rsp+73*mmsize ; t5a
+ SCRATCH 0, 14, rsp+76*mmsize
+ SCRATCH 3, 15, rsp+77*mmsize
+ mova m0, [rsp+67*mmsize] ; t0a
+ SUMSUB_BA d, 4, 0, 3 ; m4=t0, m0=t4
+ SUMSUB_BA d, 5, 1, 3 ; m5=t1, m1=t5
+ SUMSUB_BA d, 6, 2, 3 ; m6=t2, m2=t6
+ SCRATCH 4, 9, rsp+71*mmsize
+ mova m3, [rsp+69*mmsize] ; t3a
+ SUMSUB_BA d, 7, 3, 4 ; m7=t3, m3=t7
+
+ mova [rsp+67*mmsize], m5
+ mova [rsp+68*mmsize], m6
+ mova [rsp+69*mmsize], m7
+ SUMSUB_MUL_D 0, 1, 4, 5, 15137, 6270 ; m0/4=t4a, m1/5=t5a
+ SUMSUB_MUL_D 3, 2, 7, 6, 6270, 15137 ; m3/7=t7a, m2/6=t6a
+ SCRATCH 1, 11, rsp+73*mmsize
+ SUMSUB_PACK_D 2, 0, 6, 4, 1 ; m2=-out3, m0=t6
+ NEGD m2 ; m2=out3
+ UNSCRATCH 1, 11, rsp+73*mmsize
+ SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=out12, m1=t7
+ SCRATCH 2, 11, rsp+73*mmsize
+ UNSCRATCH 2, 12, rsp+74*mmsize ; t11a
+ SCRATCH 3, 12, rsp+74*mmsize
+
+ UNSCRATCH 3, 8, rsp+70*mmsize ; t10a
+ mova m4, [rsp+65*mmsize] ; t8a
+ mova m5, [rsp+66*mmsize] ; t9a
+ SUMSUB_BA d, 3, 4, 6 ; m3=-out1, m4=t10
+ NEGD m3 ; m3=out1
+ SUMSUB_BA d, 2, 5, 6 ; m2=out14, m5=t11
+ UNSCRATCH 6, 9, rsp+71*mmsize ; t0
+ UNSCRATCH 7, 14, rsp+76*mmsize ; t14a
+ SCRATCH 3, 9, rsp+71*mmsize
+ SCRATCH 2, 14, rsp+76*mmsize
+
+ SUMSUB_MUL 1, 0, 2, 3, 11585, 11585 ; m1=out4, m0=out11
+ mova [rsp+65*mmsize], m0
+ SUMSUB_MUL 5, 4, 2, 3, 11585, 11585 ; m5=out6, m4=out9
+ UNSCRATCH 0, 15, rsp+77*mmsize ; t15a
+ SUMSUB_MUL 7, 0, 2, 3, 11585, m11585 ; m7=out10, m0=out5
+
+ mova m2, [rsp+68*mmsize] ; t2
+ SUMSUB_BA d, 2, 6, 3 ; m2=out0, m6=t2a
+ SCRATCH 2, 8, rsp+70*mmsize
+ mova m2, [rsp+67*mmsize] ; t1
+ mova m3, [rsp+69*mmsize] ; t3
+ mova [rsp+67*mmsize], m7
+ SUMSUB_BA d, 3, 2, 7 ; m3=-out15, m2=t3a
+ NEGD m3 ; m3=out15
+ SCRATCH 3, 15, rsp+77*mmsize
+ SUMSUB_MUL 6, 2, 7, 3, 11585, m11585 ; m6=out8, m2=out7
+ mova m7, [rsp+67*mmsize]
+
+ SWAP 0, 1
+ SWAP 2, 5, 4, 6, 7, 3
+%endmacro
+
+%macro IADST16_FN 7
+cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+
+.body:
+ mova [rsp+64*mmsize], m0
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [%7_16x16]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [%7_16x16+cntq-1]
+%endif
+ mov skipd, 4
+ sub skipd, cntd
+ mov ptrq, rsp
+.loop_1:
+ %2_1D blockq
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 1*mmsize], m0
+ mova [ptrq+ 5*mmsize], m1
+ mova [ptrq+ 9*mmsize], m2
+ mova [ptrq+13*mmsize], m3
+ mova m7, [rsp+65*mmsize]
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 2*mmsize], m4
+ mova [ptrq+ 6*mmsize], m5
+ mova [ptrq+10*mmsize], m6
+ mova [ptrq+14*mmsize], m7
+ UNSCRATCH 0, 8, rsp+(%3+0)*mmsize
+ UNSCRATCH 1, 9, rsp+(%3+1)*mmsize
+ UNSCRATCH 2, 10, rsp+(%3+2)*mmsize
+ UNSCRATCH 3, 11, rsp+(%3+3)*mmsize
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 4*mmsize], m1
+ mova [ptrq+ 8*mmsize], m2
+ mova [ptrq+12*mmsize], m3
+ UNSCRATCH 4, 12, rsp+(%3+4)*mmsize
+ UNSCRATCH 5, 13, rsp+(%3+5)*mmsize
+ UNSCRATCH 6, 14, rsp+(%3+6)*mmsize
+ UNSCRATCH 7, 15, rsp+(%3+7)*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 3*mmsize], m4
+ mova [ptrq+ 7*mmsize], m5
+ mova [ptrq+11*mmsize], m6
+ mova [ptrq+15*mmsize], m7
+ add ptrq, 16 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ mov ptrq, rsp
+.loop_2:
+ %5_1D ptrq
+
+ pxor m7, m7
+ lea dstq, [dstq+strideq*4]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+ lea dstq, [dstq+strideq*4]
+ mova m0, [rsp+65*mmsize]
+ mova m1, [rsp+64*mmsize]
+ mova m2, [pd_32]
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+%if ARCH_X86_64
+ DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else
+ mov dstq, dstm
+%endif
+ UNSCRATCH 0, 8, rsp+(%6+0)*mmsize
+ UNSCRATCH 4, 9, rsp+(%6+1)*mmsize
+ UNSCRATCH 5, 10, rsp+(%6+2)*mmsize
+ UNSCRATCH 3, 11, rsp+(%6+3)*mmsize
+ ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea dstq, [dstbakq+stride3q*4]
+%else
+ lea dstq, [dstq+stride3q*4]
+%endif
+ UNSCRATCH 4, 12, rsp+(%6+4)*mmsize
+ UNSCRATCH 5, 13, rsp+(%6+5)*mmsize
+ UNSCRATCH 6, 14, rsp+(%6+6)*mmsize
+ UNSCRATCH 0, 15, rsp+(%6+7)*mmsize
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+ add ptrq, mmsize
+%if ARCH_X86_64
+ add dstbakq, 8
+ mov dstq, dstbakq
+%else
+ add dword dstm, 8
+ mov dstq, dstm
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m7 is still zero
+ ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
+ RET
+
+cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row
+IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col
+IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default
+
+%macro IDCT32_1D 2-3 8 * mmsize ; pass[1/2], src, src_stride
+ IDCT16_1D %2, 2 * %3, 272, 257
+%if ARCH_X86_64
+ mova [rsp+257*mmsize], m8
+ mova [rsp+258*mmsize], m9
+ mova [rsp+259*mmsize], m10
+ mova [rsp+260*mmsize], m11
+ mova [rsp+261*mmsize], m12
+ mova [rsp+262*mmsize], m13
+ mova [rsp+263*mmsize], m14
+ mova [rsp+264*mmsize], m15
+%endif
+ mova [rsp+265*mmsize], m0
+ mova [rsp+266*mmsize], m1
+ mova [rsp+267*mmsize], m2
+ mova [rsp+268*mmsize], m3
+ mova [rsp+269*mmsize], m4
+ mova [rsp+270*mmsize], m5
+ mova [rsp+271*mmsize], m6
+
+ ; r257-260: t0-3
+ ; r265-272: t4/5a/6a/7/8/9a/10/11a
+ ; r261-264: t12a/13/14a/15
+ ; r273-274 are free as scratch space, and r275-282 mirror m8-15 on 32bit
+
+ mova m0, [%2+ 1*%3] ; in1
+ mova m1, [%2+15*%3] ; in15
+ mova m2, [%2+17*%3] ; in17
+ mova m3, [%2+31*%3] ; in31
+ SUMSUB_MUL 0, 3, 4, 5, 16364, 804 ; m0=t31a, m3=t16a
+ SUMSUB_MUL 2, 1, 4, 5, 11003, 12140 ; m2=t30a, m1=t17a
+ SUMSUB_BA d, 1, 3, 4 ; m1=t16, m3=t17
+ SUMSUB_BA d, 2, 0, 4 ; m2=t31, m0=t30
+ SUMSUB_MUL 0, 3, 4, 5, 16069, 3196 ; m0=t30a, m3=t17a
+ SCRATCH 0, 8, rsp+275*mmsize
+ SCRATCH 2, 9, rsp+276*mmsize
+
+ ; end of stage 1-3 first quart
+
+ mova m0, [%2+ 7*%3] ; in7
+ mova m2, [%2+ 9*%3] ; in9
+ mova m4, [%2+23*%3] ; in23
+ mova m5, [%2+25*%3] ; in25
+ SUMSUB_MUL 2, 4, 6, 7, 14811, 7005 ; m2=t29a, m4=t18a
+ SUMSUB_MUL 5, 0, 6, 7, 5520, 15426 ; m5=t28a, m0=t19a
+ SUMSUB_BA d, 4, 0, 6 ; m4=t19, m0=t18
+ SUMSUB_BA d, 2, 5, 6 ; m2=t28, m5=t29
+ SUMSUB_MUL 5, 0, 6, 7, 3196, m16069 ; m5=t29a, m0=t18a
+
+ ; end of stage 1-3 second quart
+
+ SUMSUB_BA d, 4, 1, 6 ; m4=t16a, m1=t19a
+ SUMSUB_BA d, 0, 3, 6 ; m0=t17, m3=t18
+ UNSCRATCH 6, 8, rsp+275*mmsize ; t30a
+ UNSCRATCH 7, 9, rsp+276*mmsize ; t31
+ mova [rsp+273*mmsize], m4
+ mova [rsp+274*mmsize], m0
+ SUMSUB_BA d, 2, 7, 0 ; m2=t31a, m7=t28a
+ SUMSUB_BA d, 5, 6, 0 ; m5=t30, m6=t29
+ SUMSUB_MUL 6, 3, 0, 4, 15137, 6270 ; m6=t29a, m3=t18a
+ SUMSUB_MUL 7, 1, 0, 4, 15137, 6270 ; m7=t28, m1=t19
+ SCRATCH 3, 10, rsp+277*mmsize
+ SCRATCH 1, 11, rsp+278*mmsize
+ SCRATCH 7, 12, rsp+279*mmsize
+ SCRATCH 6, 13, rsp+280*mmsize
+ SCRATCH 5, 14, rsp+281*mmsize
+ SCRATCH 2, 15, rsp+282*mmsize
+
+ ; end of stage 4-5 first half
+
+ mova m0, [%2+ 5*%3] ; in5
+ mova m1, [%2+11*%3] ; in11
+ mova m2, [%2+21*%3] ; in21
+ mova m3, [%2+27*%3] ; in27
+ SUMSUB_MUL 0, 3, 4, 5, 15893, 3981 ; m0=t27a, m3=t20a
+ SUMSUB_MUL 2, 1, 4, 5, 8423, 14053 ; m2=t26a, m1=t21a
+ SUMSUB_BA d, 1, 3, 4 ; m1=t20, m3=t21
+ SUMSUB_BA d, 2, 0, 4 ; m2=t27, m0=t26
+ SUMSUB_MUL 0, 3, 4, 5, 9102, 13623 ; m0=t26a, m3=t21a
+ SCRATCH 0, 8, rsp+275*mmsize
+ SCRATCH 2, 9, rsp+276*mmsize
+
+ ; end of stage 1-3 third quart
+
+ mova m0, [%2+ 3*%3] ; in3
+ mova m2, [%2+13*%3] ; in13
+ mova m4, [%2+19*%3] ; in19
+ mova m5, [%2+29*%3] ; in29
+ SUMSUB_MUL 2, 4, 6, 7, 13160, 9760 ; m2=t25a, m4=t22a
+ SUMSUB_MUL 5, 0, 6, 7, 2404, 16207 ; m5=t24a, m0=t23a
+ SUMSUB_BA d, 4, 0, 6 ; m4=t23, m0=t22
+ SUMSUB_BA d, 2, 5, 6 ; m2=t24, m5=t25
+ SUMSUB_MUL 5, 0, 6, 7, 13623, m9102 ; m5=t25a, m0=t22a
+
+ ; end of stage 1-3 fourth quart
+
+ SUMSUB_BA d, 1, 4, 6 ; m1=t23a, m4=t20a
+ SUMSUB_BA d, 3, 0, 6 ; m3=t22, m0=t21
+ UNSCRATCH 6, 8, rsp+275*mmsize ; t26a
+ UNSCRATCH 7, 9, rsp+276*mmsize ; t27
+ SCRATCH 3, 8, rsp+275*mmsize
+ SCRATCH 1, 9, rsp+276*mmsize
+ SUMSUB_BA d, 7, 2, 1 ; m7=t24a, m2=t27a
+ SUMSUB_BA d, 6, 5, 1 ; m6=t25, m5=t26
+ SUMSUB_MUL 2, 4, 1, 3, 6270, m15137 ; m2=t27, m4=t20
+ SUMSUB_MUL 5, 0, 1, 3, 6270, m15137 ; m5=t26a, m0=t21a
+
+ ; end of stage 4-5 second half
+
+ UNSCRATCH 1, 12, rsp+279*mmsize ; t28
+ UNSCRATCH 3, 13, rsp+280*mmsize ; t29a
+ SCRATCH 4, 12, rsp+279*mmsize
+ SCRATCH 0, 13, rsp+280*mmsize
+ SUMSUB_BA d, 5, 3, 0 ; m5=t29, m3=t26
+ SUMSUB_BA d, 2, 1, 0 ; m2=t28a, m1=t27a
+ UNSCRATCH 0, 14, rsp+281*mmsize ; t30
+ UNSCRATCH 4, 15, rsp+282*mmsize ; t31a
+ SCRATCH 2, 14, rsp+281*mmsize
+ SCRATCH 5, 15, rsp+282*mmsize
+ SUMSUB_BA d, 6, 0, 2 ; m6=t30a, m0=t25a
+ SUMSUB_BA d, 7, 4, 2 ; m7=t31, m4=t24
+
+ mova m2, [rsp+273*mmsize] ; t16a
+ mova m5, [rsp+274*mmsize] ; t17
+ mova [rsp+273*mmsize], m6
+ mova [rsp+274*mmsize], m7
+ UNSCRATCH 6, 10, rsp+277*mmsize ; t18a
+ UNSCRATCH 7, 11, rsp+278*mmsize ; t19
+ SCRATCH 4, 10, rsp+277*mmsize
+ SCRATCH 0, 11, rsp+278*mmsize
+ UNSCRATCH 4, 12, rsp+279*mmsize ; t20
+ UNSCRATCH 0, 13, rsp+280*mmsize ; t21a
+ SCRATCH 3, 12, rsp+279*mmsize
+ SCRATCH 1, 13, rsp+280*mmsize
+ SUMSUB_BA d, 0, 6, 1 ; m0=t18, m6=t21
+ SUMSUB_BA d, 4, 7, 1 ; m4=t19a, m7=t20a
+ UNSCRATCH 3, 8, rsp+275*mmsize ; t22
+ UNSCRATCH 1, 9, rsp+276*mmsize ; t23a
+ SCRATCH 0, 8, rsp+275*mmsize
+ SCRATCH 4, 9, rsp+276*mmsize
+ SUMSUB_BA d, 3, 5, 0 ; m3=t17a, m5=t22a
+ SUMSUB_BA d, 1, 2, 0 ; m1=t16, m2=t23
+
+ ; end of stage 6
+
+ UNSCRATCH 0, 10, rsp+277*mmsize ; t24
+ UNSCRATCH 4, 11, rsp+278*mmsize ; t25a
+ SCRATCH 1, 10, rsp+277*mmsize
+ SCRATCH 3, 11, rsp+278*mmsize
+ SUMSUB_MUL 0, 2, 1, 3, 11585, 11585 ; m0=t24a, m2=t23a
+ SUMSUB_MUL 4, 5, 1, 3, 11585, 11585 ; m4=t25, m5=t22
+ UNSCRATCH 1, 12, rsp+279*mmsize ; t26
+ UNSCRATCH 3, 13, rsp+280*mmsize ; t27a
+ SCRATCH 0, 12, rsp+279*mmsize
+ SCRATCH 4, 13, rsp+280*mmsize
+ SUMSUB_MUL 3, 7, 0, 4, 11585, 11585 ; m3=t27, m7=t20
+ SUMSUB_MUL 1, 6, 0, 4, 11585, 11585 ; m1=t26a, m6=t21a
+
+ ; end of stage 7
+
+ mova m0, [rsp+269*mmsize] ; t8
+ mova m4, [rsp+270*mmsize] ; t9a
+ mova [rsp+269*mmsize], m1 ; t26a
+ mova [rsp+270*mmsize], m3 ; t27
+ mova m3, [rsp+271*mmsize] ; t10
+ SUMSUB_BA d, 2, 0, 1 ; m2=out8, m0=out23
+ SUMSUB_BA d, 5, 4, 1 ; m5=out9, m4=out22
+ SUMSUB_BA d, 6, 3, 1 ; m6=out10, m3=out21
+ mova m1, [rsp+272*mmsize] ; t11a
+ mova [rsp+271*mmsize], m0
+ SUMSUB_BA d, 7, 1, 0 ; m7=out11, m1=out20
+
+%if %1 == 1
+ TRANSPOSE4x4D 2, 5, 6, 7, 0
+ mova [ptrq+ 2*mmsize], m2
+ mova [ptrq+10*mmsize], m5
+ mova [ptrq+18*mmsize], m6
+ mova [ptrq+26*mmsize], m7
+%else ; %1 == 2
+ pxor m0, m0
+ lea dstq, [dstq+strideq*8]
+ ROUND_AND_STORE_4x4 2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+ mova m2, [rsp+271*mmsize]
+%if %1 == 1
+ TRANSPOSE4x4D 1, 3, 4, 2, 0
+ mova [ptrq+ 5*mmsize], m1
+ mova [ptrq+13*mmsize], m3
+ mova [ptrq+21*mmsize], m4
+ mova [ptrq+29*mmsize], m2
+%else ; %1 == 2
+ lea dstq, [dstq+stride3q*4]
+ ROUND_AND_STORE_4x4 1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+ ; end of last stage + store for out8-11 and out20-23
+
+ UNSCRATCH 0, 9, rsp+276*mmsize ; t19a
+ UNSCRATCH 1, 8, rsp+275*mmsize ; t18
+ UNSCRATCH 2, 11, rsp+278*mmsize ; t17a
+ UNSCRATCH 3, 10, rsp+277*mmsize ; t16
+ mova m7, [rsp+261*mmsize] ; t12a
+ mova m6, [rsp+262*mmsize] ; t13
+ mova m5, [rsp+263*mmsize] ; t14a
+ SUMSUB_BA d, 0, 7, 4 ; m0=out12, m7=out19
+ SUMSUB_BA d, 1, 6, 4 ; m1=out13, m6=out18
+ SUMSUB_BA d, 2, 5, 4 ; m2=out14, m5=out17
+ mova m4, [rsp+264*mmsize] ; t15
+ SCRATCH 7, 8, rsp+275*mmsize
+ SUMSUB_BA d, 3, 4, 7 ; m3=out15, m4=out16
+
+%if %1 == 1
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 3*mmsize], m0
+ mova [ptrq+11*mmsize], m1
+ mova [ptrq+19*mmsize], m2
+ mova [ptrq+27*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+ SWAP 7, 9
+ lea dstq, [dstbakq+stride3q*4]
+%else ; x86-32
+ pxor m7, m7
+ mov dstq, dstm
+ lea dstq, [dstq+stride3q*4]
+%endif
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+ UNSCRATCH 0, 8, rsp+275*mmsize ; out19
+%if %1 == 1
+ TRANSPOSE4x4D 4, 5, 6, 0, 7
+ mova [ptrq+ 4*mmsize], m4
+ mova [ptrq+12*mmsize], m5
+ mova [ptrq+20*mmsize], m6
+ mova [ptrq+28*mmsize], m0
+%else ; %1 == 2
+ lea dstq, [dstq+strideq*4]
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+ ; end of last stage + store for out12-19
+
+%if ARCH_X86_64
+ SWAP 7, 8
+%endif
+ mova m7, [rsp+257*mmsize] ; t0
+ mova m6, [rsp+258*mmsize] ; t1
+ mova m5, [rsp+259*mmsize] ; t2
+ mova m4, [rsp+260*mmsize] ; t3
+ mova m0, [rsp+274*mmsize] ; t31
+ mova m1, [rsp+273*mmsize] ; t30a
+ UNSCRATCH 2, 15, rsp+282*mmsize ; t29
+ SUMSUB_BA d, 0, 7, 3 ; m0=out0, m7=out31
+ SUMSUB_BA d, 1, 6, 3 ; m1=out1, m6=out30
+ SUMSUB_BA d, 2, 5, 3 ; m2=out2, m5=out29
+ SCRATCH 0, 9, rsp+276*mmsize
+ UNSCRATCH 3, 14, rsp+281*mmsize ; t28a
+ SUMSUB_BA d, 3, 4, 0 ; m3=out3, m4=out28
+
+%if %1 == 1
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 7*mmsize], m4
+ mova [ptrq+15*mmsize], m5
+ mova [ptrq+23*mmsize], m6
+ mova [ptrq+31*mmsize], m7
+%else ; %1 == 2
+%if ARCH_X86_64
+ SWAP 0, 8
+%else ; x86-32
+ pxor m0, m0
+%endif
+ lea dstq, [dstq+stride3q*4]
+ ROUND_AND_STORE_4x4 4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+ UNSCRATCH 7, 9, rsp+276*mmsize ; out0
+%if %1 == 1
+ TRANSPOSE4x4D 7, 1, 2, 3, 0
+ mova [ptrq+ 0*mmsize], m7
+ mova [ptrq+ 8*mmsize], m1
+ mova [ptrq+16*mmsize], m2
+ mova [ptrq+24*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+ DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else ; x86-32
+ mov dstq, dstm
+%endif
+ ROUND_AND_STORE_4x4 7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+%endif
+%endif
+
+ ; end of last stage + store for out0-3 and out28-31
+
+%if ARCH_X86_64
+ SWAP 0, 8
+%endif
+ mova m7, [rsp+265*mmsize] ; t4
+ mova m6, [rsp+266*mmsize] ; t5a
+ mova m5, [rsp+267*mmsize] ; t6a
+ mova m4, [rsp+268*mmsize] ; t7
+ mova m0, [rsp+270*mmsize] ; t27
+ mova m1, [rsp+269*mmsize] ; t26a
+ UNSCRATCH 2, 13, rsp+280*mmsize ; t25
+ SUMSUB_BA d, 0, 7, 3 ; m0=out4, m7=out27
+ SUMSUB_BA d, 1, 6, 3 ; m1=out5, m6=out26
+ SUMSUB_BA d, 2, 5, 3 ; m2=out6, m5=out25
+ UNSCRATCH 3, 12, rsp+279*mmsize ; t24a
+ SCRATCH 7, 9, rsp+276*mmsize
+ SUMSUB_BA d, 3, 4, 7 ; m3=out7, m4=out24
+
+%if %1 == 1
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 1*mmsize], m0
+ mova [ptrq+ 9*mmsize], m1
+ mova [ptrq+17*mmsize], m2
+ mova [ptrq+25*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+ SWAP 7, 8
+ lea dstq, [dstbakq+strideq*4]
+%else ; x86-32
+ pxor m7, m7
+ lea dstq, [dstq+strideq*4]
+%endif
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+ UNSCRATCH 0, 9, rsp+276*mmsize ; out27
+%if %1 == 1
+ TRANSPOSE4x4D 4, 5, 6, 0, 7
+ mova [ptrq+ 6*mmsize], m4
+ mova [ptrq+14*mmsize], m5
+ mova [ptrq+22*mmsize], m6
+ mova [ptrq+30*mmsize], m0
+%else ; %1 == 2
+%if ARCH_X86_64
+ lea dstq, [dstbakq+stride3q*8]
+%else
+ mov dstq, dstm
+ lea dstq, [dstq+stride3q*8]
+%endif
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+ ; end of last stage + store for out4-7 and out24-27
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+ ; coef value is 17 bits + sign and the multiplier constant is 14 bits, so
+ ; 31+sign easily fits in 32bit
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m2, m2
+ DC_ONLY 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 32
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
+ add dstq, strideq
+ dec cntd
+ jg .loop_dc
+ RET
+
+.idctfull:
+ mova [rsp+256*mmsize], m0
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [default_32x32]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_32x32+cntq-1]
+%endif
+ mov skipd, 8
+ sub skipd, cntd
+ mov ptrq, rsp
+.loop_1:
+ IDCT32_1D 1, blockq
+
+ add ptrq, 32 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ shl skipd, 2
+ lea blockq, [blockq+skipq*(mmsize/4)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 8
+ mov ptrq, rsp
+.loop_2:
+ IDCT32_1D 2, ptrq
+
+ add ptrq, mmsize
+%if ARCH_X86_64
+ add dstbakq, 8
+ mov dstq, dstbakq
+%else
+ add dword dstm, 8
+ mov dstq, dstm
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m7 is still zero
+ ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
+ RET
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ cmp eobd, 1
+ jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull
+
+ ; dc-only - unfortunately, this one can overflow, since 12 bpp coefs are
+ ; 19 bits + sign, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
+ DEFINE_ARGS dst, stride, block, coef, coefl
+ pxor m2, m2
+ DC_ONLY_64BIT 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 32
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
+ add dstq, strideq
+ dec cntd
+ jg .loop_dc
+ RET
diff --git a/libavcodec/x86/vp9itxfm_template.asm b/libavcodec/x86/vp9itxfm_template.asm
new file mode 100644
index 0000000000..d2f2257d84
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_template.asm
@@ -0,0 +1,142 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%macro VP9_IWHT4_1D 0
+ SWAP 1, 2, 3
+ paddw m0, m2
+ psubw m3, m1
+ psubw m4, m0, m3
+ psraw m4, 1
+ psubw m5, m4, m1
+ SWAP 5, 1
+ psubw m4, m2
+ SWAP 4, 2
+ psubw m0, m1
+ paddw m3, m2
+ SWAP 3, 2, 1
+%endmacro
+
+; (a*x + b*y + round) >> shift
+%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
+ pmaddwd m%1, m%2, %4
+ pmaddwd m%2, %5
+ paddd m%1, %3
+ paddd m%2, %3
+ psrad m%1, 14
+ psrad m%2, 14
+%endmacro
+
+%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
+ VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
+ VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
+ packssdw m%1, m%7
+ packssdw m%2, m%6
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
+ punpckhwd m%6, m%2, m%1
+ punpcklwd m%2, m%1
+ VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7
+%else
+ punpckhwd m%8, m%4, m%3
+ punpcklwd m%2, m%4, m%3
+ VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9
+%endif
+%endmacro
+
+%macro VP9_IDCT4_1D_FINALIZE 0
+ SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
+ SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
+ SWAP 0, 3, 2 ; 3102 -> 0123
+%endmacro
+
+%macro VP9_IDCT4_1D 0
+%if cpuflag(ssse3)
+ SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
+ pmulhrsw m2, m6 ; m2=t0
+ pmulhrsw m0, m6 ; m0=t1
+%else ; <= sse2
+ VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m2=t0
+%endif
+ VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
+ VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IADST4_1D 0
+ movq2dq xmm0, m0
+ movq2dq xmm1, m1
+ movq2dq xmm2, m2
+ movq2dq xmm3, m3
+%if cpuflag(ssse3)
+ paddw m3, m0
+%endif
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+ pmaddwd xmm1, xmm0, [pw_5283_13377]
+ pmaddwd xmm4, xmm0, [pw_9929_13377]
+%if notcpuflag(ssse3)
+ pmaddwd xmm6, xmm0, [pw_13377_0]
+%endif
+ pmaddwd xmm0, [pw_15212_m13377]
+ pmaddwd xmm3, xmm2, [pw_15212_9929]
+%if notcpuflag(ssse3)
+ pmaddwd xmm7, xmm2, [pw_m13377_13377]
+%endif
+ pmaddwd xmm2, [pw_m5283_m15212]
+%if cpuflag(ssse3)
+ psubw m3, m2
+%else
+ paddd xmm6, xmm7
+%endif
+ paddd xmm0, xmm2
+ paddd xmm3, xmm5
+ paddd xmm2, xmm5
+%if notcpuflag(ssse3)
+ paddd xmm6, xmm5
+%endif
+ paddd xmm1, xmm3
+ paddd xmm0, xmm3
+ paddd xmm4, xmm2
+ psrad xmm1, 14
+ psrad xmm0, 14
+ psrad xmm4, 14
+%if cpuflag(ssse3)
+ pmulhrsw m3, [pw_13377x2] ; out2
+%else
+ psrad xmm6, 14
+%endif
+ packssdw xmm0, xmm0
+ packssdw xmm1, xmm1
+ packssdw xmm4, xmm4
+%if notcpuflag(ssse3)
+ packssdw xmm6, xmm6
+%endif
+ movdq2q m0, xmm0 ; out3
+ movdq2q m1, xmm1 ; out0
+ movdq2q m2, xmm4 ; out1
+%if notcpuflag(ssse3)
+ movdq2q m3, xmm6 ; out2
+%endif
+ SWAP 0, 1, 2, 3
+%endmacro
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index 54f20fe090..4e7ede2235 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -4,24 +4,23 @@
;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
-
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
@@ -37,8 +36,8 @@ pb_f8: times 16 db 0xf8
pb_fe: times 16 db 0xfe
pb_ff: times 16 db 0xff
-pw_4: times 8 dw 4
-pw_8: times 8 dw 8
+cextern pw_4
+cextern pw_8
; with mix functions, two 8-bit thresholds are stored in a 16-bit storage,
; the following mask is used to splat both in the same register
@@ -53,7 +52,7 @@ mask_mix48: times 8 db 0x00
SECTION .text
%macro SCRATCH 3
-%if ARCH_X86_64
+%ifdef m8
SWAP %1, %2
%else
mova [%3], m%1
@@ -61,7 +60,7 @@ SECTION .text
%endmacro
%macro UNSCRATCH 3
-%if ARCH_X86_64
+%ifdef m8
SWAP %1, %2
%else
mova m%1, [%3]
@@ -70,7 +69,7 @@ SECTION .text
; %1 = abs(%2-%3)
%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
-%if ARCH_X86_64
+%ifdef m8
psubusb %1, %3, %2
psubusb %4, %2, %3
%else
@@ -103,7 +102,7 @@ SECTION .text
%endmacro
%macro UNPACK 4
-%if ARCH_X86_64
+%ifdef m8
punpck%1bw %2, %3, %4
%else
mova %2, %3
@@ -113,27 +112,27 @@ SECTION .text
%macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1
; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32]
- psubw %3, [rsp+%4+%5*32]
- psubw %3, [rsp+%4+%6*32]
- paddw %3, [rsp+%4+%7*32]
+ psubw %3, [rsp+%4+%5*mmsize*2]
+ psubw %3, [rsp+%4+%6*mmsize*2]
+ paddw %3, [rsp+%4+%7*mmsize*2]
%ifnidn %10, ""
%if %11 == 0
punpck%2bw %1, %10, m0
%else
UNPACK %2, %1, %10, m0
%endif
- mova [rsp+%4+%8*32], %1
+ mova [rsp+%4+%8*mmsize*2], %1
paddw %3, %1
%else
- paddw %3, [rsp+%4+%8*32]
+ paddw %3, [rsp+%4+%8*mmsize*2]
%endif
psraw %1, %3, %9
%endmacro
; FIXME interleave l/h better (for instruction pairing)
%macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source
- FILTER%7_INIT %1, l, %3, %6 + 0
- FILTER%7_INIT %2, h, %4, %6 + 16
+ FILTER%7_INIT %1, l, %3, %6 + 0
+ FILTER%7_INIT %2, h, %4, %6 + mmsize
packuswb %1, %2
MASK_APPLY %1, %9, %8, %2
mova %5, %1
@@ -148,8 +147,8 @@ SECTION .text
mova %14, %15
%endif
%endif
- FILTER_SUBx2_ADDx2 %1, l, %3, %6 + 0, %7, %8, %9, %10, %11, %14, %16
- FILTER_SUBx2_ADDx2 %2, h, %4, %6 + 16, %7, %8, %9, %10, %11, %14, %16
+ FILTER_SUBx2_ADDx2 %1, l, %3, %6 + 0, %7, %8, %9, %10, %11, %14, %16
+ FILTER_SUBx2_ADDx2 %2, h, %4, %6 + mmsize, %7, %8, %9, %10, %11, %14, %16
packuswb %1, %2
%ifnidn %13, ""
MASK_APPLY %1, %13, %12, %2
@@ -196,21 +195,21 @@ SECTION .text
%macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
UNPACK %2, %1, rp3, m0 ; p3: B->W
- mova [rsp+%4+0*32], %1
+ mova [rsp+%4+0*mmsize*2], %1
paddw %3, %1, %1 ; p3*2
paddw %3, %1 ; p3*3
punpck%2bw %1, m1, m0 ; p2: B->W
- mova [rsp+%4+1*32], %1
+ mova [rsp+%4+1*mmsize*2], %1
paddw %3, %1 ; p3*3 + p2
paddw %3, %1 ; p3*3 + p2*2
UNPACK %2, %1, rp1, m0 ; p1: B->W
- mova [rsp+%4+2*32], %1
+ mova [rsp+%4+2*mmsize*2], %1
paddw %3, %1 ; p3*3 + p2*2 + p1
UNPACK %2, %1, rp0, m0 ; p0: B->W
- mova [rsp+%4+3*32], %1
+ mova [rsp+%4+3*mmsize*2], %1
paddw %3, %1 ; p3*3 + p2*2 + p1 + p0
UNPACK %2, %1, rq0, m0 ; q0: B->W
- mova [rsp+%4+4*32], %1
+ mova [rsp+%4+4*mmsize*2], %1
paddw %3, %1 ; p3*3 + p2*2 + p1 + p0 + q0
paddw %3, [pw_4] ; p3*3 + p2*2 + p1 + p0 + q0 + 4
psraw %1, %3, 3 ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
@@ -218,24 +217,24 @@ SECTION .text
%macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
punpck%2bw %1, m2, m0 ; p7: B->W
- mova [rsp+%4+ 8*32], %1
+ mova [rsp+%4+ 8*mmsize*2], %1
psllw %3, %1, 3 ; p7*8
psubw %3, %1 ; p7*7
punpck%2bw %1, m3, m0 ; p6: B->W
- mova [rsp+%4+ 9*32], %1
+ mova [rsp+%4+ 9*mmsize*2], %1
paddw %3, %1 ; p7*7 + p6
paddw %3, %1 ; p7*7 + p6*2
UNPACK %2, %1, rp5, m0 ; p5: B->W
- mova [rsp+%4+10*32], %1
+ mova [rsp+%4+10*mmsize*2], %1
paddw %3, %1 ; p7*7 + p6*2 + p5
UNPACK %2, %1, rp4, m0 ; p4: B->W
- mova [rsp+%4+11*32], %1
+ mova [rsp+%4+11*mmsize*2], %1
paddw %3, %1 ; p7*7 + p6*2 + p5 + p4
- paddw %3, [rsp+%4+ 0*32] ; p7*7 + p6*2 + p5 + p4 + p3
- paddw %3, [rsp+%4+ 1*32] ; p7*7 + p6*2 + p5 + .. + p2
- paddw %3, [rsp+%4+ 2*32] ; p7*7 + p6*2 + p5 + .. + p1
- paddw %3, [rsp+%4+ 3*32] ; p7*7 + p6*2 + p5 + .. + p0
- paddw %3, [rsp+%4+ 4*32] ; p7*7 + p6*2 + p5 + .. + p0 + q0
+ paddw %3, [rsp+%4+ 0*mmsize*2] ; p7*7 + p6*2 + p5 + p4 + p3
+ paddw %3, [rsp+%4+ 1*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p2
+ paddw %3, [rsp+%4+ 2*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p1
+ paddw %3, [rsp+%4+ 3*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0
+ paddw %3, [rsp+%4+ 4*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0 + q0
paddw %3, [pw_8] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
psraw %1, %3, 4 ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
%endmacro
@@ -335,22 +334,24 @@ SECTION .text
%endmacro
%macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0
-%define P3 rsp + 0 + %1
-%define P2 rsp + 16 + %1
-%define P1 rsp + 32 + %1
-%define P0 rsp + 48 + %1
-%define Q0 rsp + 64 + %1
-%define Q1 rsp + 80 + %1
-%define Q2 rsp + 96 + %1
-%define Q3 rsp + 112 + %1
-%define P7 rsp + 128 + %1
-%define P6 rsp + 144 + %1
-%define P5 rsp + 160 + %1
-%define P4 rsp + 176 + %1
-%define Q4 rsp + 192 + %1
-%define Q5 rsp + 208 + %1
-%define Q6 rsp + 224 + %1
-%define Q7 rsp + 240 + %1
+%define P3 rsp + 0*mmsize + %1
+%define P2 rsp + 1*mmsize + %1
+%define P1 rsp + 2*mmsize + %1
+%define P0 rsp + 3*mmsize + %1
+%define Q0 rsp + 4*mmsize + %1
+%define Q1 rsp + 5*mmsize + %1
+%define Q2 rsp + 6*mmsize + %1
+%define Q3 rsp + 7*mmsize + %1
+%if mmsize == 16
+%define P7 rsp + 8*mmsize + %1
+%define P6 rsp + 9*mmsize + %1
+%define P5 rsp + 10*mmsize + %1
+%define P4 rsp + 11*mmsize + %1
+%define Q4 rsp + 12*mmsize + %1
+%define Q5 rsp + 13*mmsize + %1
+%define Q6 rsp + 14*mmsize + %1
+%define Q7 rsp + 15*mmsize + %1
+%endif
%endmacro
; ..............AB -> AAAAAAAABBBBBBBB
@@ -364,14 +365,19 @@ SECTION .text
%endif
%endmacro
-%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=32bit stack only
+%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=mmx/32bit stack only
+%assign %%ext 0
+%if ARCH_X86_32 || mmsize == 8
+%assign %%ext %5
+%endif
+
%if UNIX64
-cglobal vp9_loop_filter_%1_%2_16, 5, 9, 16, %3 + %4, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 5, 9, 16, %3 + %4 + %%ext, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
%else
%if WIN64
-cglobal vp9_loop_filter_%1_%2_16, 4, 8, 16, %3 + %4, dst, stride, E, I, mstride, dst2, stride3, mstride3
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 4, 8, 16, %3 + %4 + %%ext, dst, stride, E, I, mstride, dst2, stride3, mstride3
%else
-cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, dst2, stride3, mstride3
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 2, 6, 16, %3 + %4 + %%ext, dst, stride, mstride, dst2, stride3, mstride3
%define Ed dword r2m
%define Id dword r3m
%endif
@@ -385,18 +391,22 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
lea mstride3q, [mstrideq*3]
%ifidn %1, h
-%if %2 > 16
+%if %2 != 16
+%if mmsize == 16
%define movx movh
+%else
+%define movx mova
+%endif
lea dstq, [dstq + 4*strideq - 4]
%else
%define movx movu
lea dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos)
%endif
- lea dst2q, [dstq + 8*strideq]
%else
lea dstq, [dstq + 4*mstrideq]
- lea dst2q, [dstq + 8*strideq]
%endif
+ ; FIXME we shouldn't need two dst registers if mmsize == 8
+ lea dst2q, [dstq + 8*strideq]
DEFINE_REAL_P7_TO_Q7
@@ -407,11 +417,11 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movx m3, [P4]
movx m4, [P3]
movx m5, [P2]
-%if ARCH_X86_64 || %2 != 16
+%if (ARCH_X86_64 && mmsize == 16) || %2 > 16
movx m6, [P1]
%endif
movx m7, [P0]
-%if ARCH_X86_64
+%ifdef m8
movx m8, [Q0]
movx m9, [Q1]
movx m10, [Q2]
@@ -503,7 +513,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movhps [Q5], m6
movhps [Q7], m7
DEFINE_TRANSPOSED_P7_TO_Q7
-%else ; %2 == 44/48/84/88
+%elif %2 > 16 ; %2 == 44/48/84/88
punpcklbw m0, m1
punpcklbw m2, m3
punpcklbw m4, m5
@@ -530,12 +540,31 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
mova [Q1], m5
mova [Q2], m7
mova [Q3], m3
+%else ; %2 == 4 || %2 == 8
+ SBUTTERFLY bw, 0, 1, 6
+ SBUTTERFLY bw, 2, 3, 6
+ SBUTTERFLY bw, 4, 5, 6
+ mova [rsp+4*mmsize], m5
+ mova m6, [P1]
+ SBUTTERFLY bw, 6, 7, 5
+ DEFINE_TRANSPOSED_P7_TO_Q7
+ TRANSPOSE4x4W 0, 2, 4, 6, 5
+ mova [P3], m0
+ mova [P2], m2
+ mova [P1], m4
+ mova [P0], m6
+ mova m5, [rsp+4*mmsize]
+ TRANSPOSE4x4W 1, 3, 5, 7, 0
+ mova [Q0], m1
+ mova [Q1], m3
+ mova [Q2], m5
+ mova [Q3], m7
%endif ; %2
%endif ; x86-32/64
%endif ; %1 == h
; calc fm mask
-%if %2 == 16
+%if %2 == 16 || mmsize == 8
%if cpuflag(ssse3)
pxor m0, m0
%endif
@@ -553,7 +582,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
mova m0, [pb_80]
pxor m2, m0
pxor m3, m0
-%if ARCH_X86_64
+%ifdef m8
%ifidn %1, v
mova m8, [P3]
mova m9, [P2]
@@ -614,10 +643,10 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
; calc flat8in (if not 44_16) and hev masks
-%if %2 != 44
+%if %2 != 44 && %2 != 4
mova m6, [pb_81] ; [1 1 1 1 ...] ^ 0x80
ABSSUB_GT m2, rp3, rp0, m6, m5 ; abs(p3 - p0) <= 1
-%if ARCH_X86_64
+%ifdef m8
mova m8, [pb_80]
%define rb80 m8
%else
@@ -626,7 +655,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
ABSSUB_GT m1, rp2, rp0, m6, m5, rb80 ; abs(p2 - p0) <= 1
por m2, m1
ABSSUB m4, rp1, rp0, m5 ; abs(p1 - p0)
-%if %2 == 16
+%if %2 <= 16
%if cpuflag(ssse3)
pxor m0, m0
%endif
@@ -656,8 +685,15 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
%endif
%else
mova m6, [pb_80]
+%if %2 == 44
movd m7, Hd
SPLATB_MIX m7
+%else
+%if cpuflag(ssse3)
+ pxor m0, m0
+%endif
+ SPLATB_REG m7, H, m0 ; H H H H ...
+%endif
pxor m7, m6
ABSSUB m4, rp1, rp0, m1 ; abs(p1 - p0)
pxor m4, m6
@@ -671,7 +707,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
%if %2 == 16
; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
; calc flat8out mask
-%if ARCH_X86_64
+%ifdef m8
mova m8, [P7]
mova m9, [P6]
%define rp7 m8
@@ -683,7 +719,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
ABSSUB_GT m1, rp7, rp0, m6, m5 ; abs(p7 - p0) <= 1
ABSSUB_GT m7, rp6, rp0, m6, m5 ; abs(p6 - p0) <= 1
por m1, m7
-%if ARCH_X86_64
+%ifdef m8
mova m8, [P5]
mova m9, [P4]
%define rp5 m8
@@ -696,7 +732,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
por m1, m7
ABSSUB_GT m7, rp4, rp0, m6, m5 ; abs(p4 - p0) <= 1
por m1, m7
-%if ARCH_X86_64
+%ifdef m8
mova m14, [Q4]
mova m15, [Q5]
%define rq4 m14
@@ -709,7 +745,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
por m1, m7
ABSSUB_GT m7, rq5, rq0, m6, m5 ; abs(q5 - q0) <= 1
por m1, m7
-%if ARCH_X86_64
+%ifdef m8
mova m14, [Q6]
mova m15, [Q7]
%define rq6 m14
@@ -739,7 +775,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
; filter2()
-%if %2 != 44
+%if %2 != 44 && %2 != 4
mova m6, [pb_80] ; already in m6 if 44_16
SCRATCH 2, 15, rsp+%3+%4
%if %2 == 16
@@ -757,7 +793,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
paddsb m4, m2 ; 3*(q0 - p0) + (p1 - q1)
paddsb m6, m4, [pb_4] ; m6: f1 = clip(f + 4, 127)
paddsb m4, [pb_3] ; m4: f2 = clip(f + 3, 127)
-%if ARCH_X86_64
+%ifdef m8
mova m14, [pb_10] ; will be reused in filter4()
%define rb10 m14
%else
@@ -766,8 +802,8 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
SRSHIFT3B_2X m6, m4, rb10, m7 ; f1 and f2 sign byte shift by 3
SIGN_SUB m7, rq0, m6, m5 ; m7 = q0 - f1
SIGN_ADD m1, rp0, m4, m5 ; m1 = p0 + f2
-%if %2 != 44
-%if ARCH_X86_64
+%if %2 != 44 && %2 != 4
+%ifdef m8
pandn m6, m15, m3 ; ~mask(in) & mask(fm)
%else
mova m6, [rsp+%3+%4]
@@ -788,8 +824,8 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
paddsb m6, m2, [pb_4] ; m6: f1 = clip(f + 4, 127)
paddsb m2, [pb_3] ; m2: f2 = clip(f + 3, 127)
SRSHIFT3B_2X m6, m2, rb10, m4 ; f1 and f2 sign byte shift by 3
-%if %2 != 44
-%if ARCH_X86_64
+%if %2 != 44 && %2 != 4
+%ifdef m8
pandn m5, m15, m3 ; ~mask(in) & mask(fm)
%else
mova m5, [rsp+%3+%4]
@@ -816,26 +852,26 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
mova [P1], m1
mova [Q1], m4
-%if %2 != 44
+%if %2 != 44 && %2 != 4
UNSCRATCH 2, 15, rsp+%3+%4
%endif
; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
; filter6()
-%if %2 != 44
+%if %2 != 44 && %2 != 4
pxor m0, m0
-%if %2 > 16
+%if %2 != 16
pand m3, m2
%else
pand m2, m3 ; mask(fm) & mask(in)
-%if ARCH_X86_64
+%ifdef m8
pandn m3, m8, m2 ; ~mask(out) & (mask(fm) & mask(in))
%else
mova m3, [rsp+%3+%4+16]
pandn m3, m2
%endif
%endif
-%if ARCH_X86_64
+%ifdef m8
mova m14, [P3]
mova m9, [Q3]
%define rp3 m14
@@ -883,7 +919,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
pand m1, m2 ; mask(out) & (mask(fm) & mask(in))
mova m2, [P7]
mova m3, [P6]
-%if ARCH_X86_64
+%ifdef m8
mova m8, [P5]
mova m9, [P4]
%define rp5 m8
@@ -1009,7 +1045,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movhps [Q5], m6
movhps [Q7], m7
%endif
-%elif %2 == 44
+%elif %2 == 44 || %2 == 4
SWAP 0, 1 ; m0 = p1
SWAP 1, 7 ; m1 = p0
SWAP 2, 5 ; m2 = q0
@@ -1019,6 +1055,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
SBUTTERFLY bw, 2, 3, 4
SBUTTERFLY wd, 0, 2, 4
SBUTTERFLY wd, 1, 3, 4
+%if mmsize == 16
movd [P7], m0
movd [P3], m2
movd [Q0], m1
@@ -1048,6 +1085,20 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movd [Q3], m1
movd [Q7], m3
%else
+ movd [P7], m0
+ movd [P5], m2
+ movd [P3], m1
+ movd [P1], m3
+ psrlq m0, 32
+ psrlq m2, 32
+ psrlq m1, 32
+ psrlq m3, 32
+ movd [P6], m0
+ movd [P4], m2
+ movd [P2], m1
+ movd [P0], m3
+%endif
+%else
; the following code does a transpose of 8 full lines to 16 half
; lines (high part). It is inlined to avoid the need for a staging area
mova m0, [P3]
@@ -1056,12 +1107,12 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
mova m3, [P0]
mova m4, [Q0]
mova m5, [Q1]
-%if ARCH_X86_64
+%ifdef m8
mova m6, [Q2]
%endif
mova m7, [Q3]
DEFINE_REAL_P7_TO_Q7
-%if ARCH_X86_64
+%ifdef m8
SBUTTERFLY bw, 0, 1, 8
SBUTTERFLY bw, 2, 3, 8
SBUTTERFLY bw, 4, 5, 8
@@ -1076,27 +1127,32 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
SBUTTERFLY dq, 3, 7, 8
%else
SBUTTERFLY bw, 0, 1, 6
- mova [rsp+64], m1
- mova m6, [rsp+96]
+ mova [rsp+mmsize*4], m1
+ mova m6, [rsp+mmsize*6]
SBUTTERFLY bw, 2, 3, 1
SBUTTERFLY bw, 4, 5, 1
SBUTTERFLY bw, 6, 7, 1
SBUTTERFLY wd, 0, 2, 1
- mova [rsp+96], m2
- mova m1, [rsp+64]
+ mova [rsp+mmsize*6], m2
+ mova m1, [rsp+mmsize*4]
SBUTTERFLY wd, 1, 3, 2
SBUTTERFLY wd, 4, 6, 2
SBUTTERFLY wd, 5, 7, 2
SBUTTERFLY dq, 0, 4, 2
SBUTTERFLY dq, 1, 5, 2
+%if mmsize == 16
movh [Q0], m1
movhps [Q1], m1
- mova m2, [rsp+96]
+%else
+ mova [P3], m1
+%endif
+ mova m2, [rsp+mmsize*6]
SBUTTERFLY dq, 2, 6, 1
SBUTTERFLY dq, 3, 7, 1
%endif
SWAP 3, 6
SWAP 1, 4
+%if mmsize == 16
movh [P7], m0
movhps [P6], m0
movh [P5], m1
@@ -1105,7 +1161,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movhps [P2], m2
movh [P1], m3
movhps [P0], m3
-%if ARCH_X86_64
+%ifdef m8
movh [Q0], m4
movhps [Q1], m4
%endif
@@ -1115,6 +1171,15 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movhps [Q5], m6
movh [Q6], m7
movhps [Q7], m7
+%else
+ mova [P7], m0
+ mova [P6], m1
+ mova [P5], m2
+ mova [P4], m3
+ mova [P2], m5
+ mova [P1], m6
+ mova [P0], m7
+%endif
%endif
%endif
@@ -1138,3 +1203,9 @@ LPF_16_VH_ALL_OPTS 44, 0, 128, 0
LPF_16_VH_ALL_OPTS 48, 256, 128, 16
LPF_16_VH_ALL_OPTS 84, 256, 128, 16
LPF_16_VH_ALL_OPTS 88, 256, 128, 16
+
+INIT_MMX mmxext
+LOOPFILTER v, 4, 0, 0, 0
+LOOPFILTER h, 4, 0, 64, 0
+LOOPFILTER v, 8, 128, 0, 8
+LOOPFILTER h, 8, 128, 64, 8
diff --git a/libavcodec/x86/vp9lpf_16bpp.asm b/libavcodec/x86/vp9lpf_16bpp.asm
new file mode 100644
index 0000000000..c0888170c9
--- /dev/null
+++ b/libavcodec/x86/vp9lpf_16bpp.asm
@@ -0,0 +1,823 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_511: times 16 dw 511
+pw_2047: times 16 dw 2047
+pw_16384: times 16 dw 16384
+pw_m512: times 16 dw -512
+pw_m2048: times 16 dw -2048
+
+cextern pw_1
+cextern pw_3
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_256
+cextern pw_1023
+cextern pw_4095
+cextern pw_m1
+
+SECTION .text
+
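+; SCRATCH/UNSCRATCH move a value between the spare registers m8-m15 (x86-64)
+; and the given stack slot (x86-32); PRELOAD loads a constant into a spare
+; register on x86-64 and leaves the reg_ name as a memory operand on x86-32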
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+ mova [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+ mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; calculate p or q portion of flat8out
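+; (reg_F holds the flat threshold 1 << (bpp - 8): pw_4 at 10 bpp, pw_16 at 12 bpp)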
+%macro FLAT8OUT_HALF 0
+ psubw m4, m0 ; q4-q0
+ psubw m5, m0 ; q5-q0
+ psubw m6, m0 ; q6-q0
+ psubw m7, m0 ; q7-q0
+ ABS2 m4, m5, m2, m3 ; abs(q4-q0) | abs(q5-q0)
+ ABS2 m6, m7, m2, m3 ; abs(q6-q0) | abs(q7-q0)
+ pcmpgtw m4, reg_F ; abs(q4-q0) > F
+ pcmpgtw m5, reg_F ; abs(q5-q0) > F
+ pcmpgtw m6, reg_F ; abs(q6-q0) > F
+ pcmpgtw m7, reg_F ; abs(q7-q0) > F
+ por m5, m4
+ por m7, m6
+ por m7, m5 ; !flat8out, q portion
+%endmacro
+
+; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
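+; (reg_I and reg_H hold the I and H thresholds pre-shifted left by bpp - 8)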
+%macro FLAT8IN_HALF 1
+%if %1 > 4
+ psubw m4, m3, m0 ; q3-q0
+ psubw m5, m2, m0 ; q2-q0
+ ABS2 m4, m5, m6, m7 ; abs(q3-q0) | abs(q2-q0)
+ pcmpgtw m4, reg_F ; abs(q3-q0) > F
+ pcmpgtw m5, reg_F ; abs(q2-q0) > F
+%endif
+ psubw m3, m2 ; q3-q2
+ psubw m2, m1 ; q2-q1
+ ABS2 m3, m2, m6, m7 ; abs(q3-q2) | abs(q2-q1)
+ pcmpgtw m3, reg_I ; abs(q3-q2) > I
+ pcmpgtw m2, reg_I ; abs(q2-q1) > I
+%if %1 > 4
+ por m4, m5
+%endif
+ por m2, m3
+ psubw m3, m1, m0 ; q1-q0
+ ABS1 m3, m5 ; abs(q1-q0)
+%if %1 > 4
+ pcmpgtw m6, m3, reg_F ; abs(q1-q0) > F
+%endif
+ pcmpgtw m7, m3, reg_H ; abs(q1-q0) > H
+ pcmpgtw m3, reg_I ; abs(q1-q0) > I
+%if %1 > 4
+ por m4, m6
+%endif
+ por m2, m3
+%endmacro
+
+; one step in filter_14/filter_6
+;
+; take sum $reg, downshift, apply mask and write into dst
+;
+; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
+; step's sum $reg. This is omitted for the last row in each filter.
+;
+; if dont_store is set, don't write the result into memory, instead keep the
+; value in a register so we can write it out later
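+;
+; e.g. in filter_6 the running sum for the p2 output is p3*3 + p2*2 + p1 + p0
+; + q0 (plus the rounding term, cf. the 8-bit FILTER6_INIT); the p2 step then
+; subtracts p3 and p2 and adds p1 and q1, turning it into the sum for the p1 output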
+%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
+ ; src/sub1, sub2, add1, add2, dont_store
+ psrlw %1, %2, %4
+ psubw %1, %6 ; abs->delta
+%ifnidn %7, ""
+ psubw %2, %6
+ psubw %2, %7
+ paddw %2, %8
+ paddw %2, %9
+%endif
+ pand %1, reg_%3 ; apply mask
+%if %10 == 1
+ paddw %6, %1 ; delta->abs
+%else
+ paddw %1, %6 ; delta->abs
+ mova [%5], %1
+%endif
+%endmacro
+
+; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}
+
+%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]
+
+%if ARCH_X86_64
+%if %2 == 16
+%assign %%num_xmm_regs 16
+%elif %2 == 8
+%assign %%num_xmm_regs 15
+%else ; %2 == 4
+%assign %%num_xmm_regs 14
+%endif ; %2
+%assign %%bak_mem 0
+%else ; ARCH_X86_32
+%assign %%num_xmm_regs 8
+%if %2 == 16
+%assign %%bak_mem 7
+%elif %2 == 8
+%assign %%bak_mem 6
+%else ; %2 == 4
+%assign %%bak_mem 5
+%endif ; %2
+%endif ; ARCH_X86_64/32
+
+%if %2 == 16
+%ifidn %1, v
+%assign %%num_gpr_regs 6
+%else ; %1 == h
+%assign %%num_gpr_regs 5
+%endif ; %1
+%assign %%wd_mem 6
+%else ; %2 == 8/4
+%assign %%num_gpr_regs 5
+%if ARCH_X86_32 && %2 == 8
+%assign %%wd_mem 2
+%else ; ARCH_X86_64 || %2 == 4
+%assign %%wd_mem 0
+%endif ; ARCH_X86_64/32 etc.
+%endif ; %2
+
+%ifidn %1, v
+%assign %%tsp_mem 0
+%elif %2 == 16 ; && %1 == h
+%assign %%tsp_mem 16
+%else ; %1 == h && %2 == 8/4
+%assign %%tsp_mem 8
+%endif ; %1/%2
+
+%assign %%off %%wd_mem
+%assign %%tspoff %%bak_mem+%%wd_mem
+%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
+
+%if %3 == 10
+%define %%maxsgn 511
+%define %%minsgn m512
+%define %%maxusgn 1023
+%define %%maxf 4
+%else ; %3 == 12
+%define %%maxsgn 2047
+%define %%minsgn m2048
+%define %%maxusgn 4095
+%define %%maxf 16
+%endif ; %3
+
+cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
+ ; prepare E, I and H masks
+ shl Ed, %3-8
+ shl Id, %3-8
+ shl Hd, %3-8
+%if cpuflag(ssse3)
+ mova m0, [pw_256]
+%endif
+ movd m1, Ed
+ movd m2, Id
+ movd m3, Hd
+%if cpuflag(ssse3)
+ pshufb m1, m0 ; E << (bit_depth - 8)
+ pshufb m2, m0 ; I << (bit_depth - 8)
+ pshufb m3, m0 ; H << (bit_depth - 8)
+%else
+ punpcklwd m1, m1
+ punpcklwd m2, m2
+ punpcklwd m3, m3
+ pshufd m1, m1, q0000
+ pshufd m2, m2, q0000
+ pshufd m3, m3, q0000
+%endif
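+ ; e.g. at 10 bpp with E = 40 (an arbitrary example value), Ed is now
+ ; 40 << 2 = 160; pw_256's byte pattern 0,1,0,1,... makes pshufb act as a
+ ; low-word broadcast, so m1 ends up holding 160 in all eight words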
+ SCRATCH 1, 8, rsp+(%%off+0)*mmsize, E
+ SCRATCH 2, 9, rsp+(%%off+1)*mmsize, I
+ SCRATCH 3, 10, rsp+(%%off+2)*mmsize, H
+%if %2 > 4
+ PRELOAD 11, pw_ %+ %%maxf, F
+%endif
+
+ ; set up variables to load data
+%ifidn %1, v
+ DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
+ lea stride3q, [strideq*3]
+ neg strideq
+%if %2 == 16
+ lea dst0q, [dst8q+strideq*8]
+%else
+ lea dst4q, [dst8q+strideq*4]
+%endif
+ neg strideq
+%if %2 == 16
+ lea dst12q, [dst8q+strideq*4]
+ lea dst4q, [dst0q+strideq*4]
+%endif
+
+%if %2 == 16
+%define %%p7 dst0q
+%define %%p6 dst0q+strideq
+%define %%p5 dst0q+strideq*2
+%define %%p4 dst0q+stride3q
+%endif
+%define %%p3 dst4q
+%define %%p2 dst4q+strideq
+%define %%p1 dst4q+strideq*2
+%define %%p0 dst4q+stride3q
+%define %%q0 dst8q
+%define %%q1 dst8q+strideq
+%define %%q2 dst8q+strideq*2
+%define %%q3 dst8q+stride3q
+%if %2 == 16
+%define %%q4 dst12q
+%define %%q5 dst12q+strideq
+%define %%q6 dst12q+strideq*2
+%define %%q7 dst12q+stride3q
+%endif
+%else ; %1 == h
+ DEFINE_ARGS dst0, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dst0q+strideq*4]
+
+%define %%p3 rsp+(%%tspoff+0)*mmsize
+%define %%p2 rsp+(%%tspoff+1)*mmsize
+%define %%p1 rsp+(%%tspoff+2)*mmsize
+%define %%p0 rsp+(%%tspoff+3)*mmsize
+%define %%q0 rsp+(%%tspoff+4)*mmsize
+%define %%q1 rsp+(%%tspoff+5)*mmsize
+%define %%q2 rsp+(%%tspoff+6)*mmsize
+%define %%q3 rsp+(%%tspoff+7)*mmsize
+
+%if %2 < 16
+ movu m0, [dst0q+strideq*0-8]
+ movu m1, [dst0q+strideq*1-8]
+ movu m2, [dst0q+strideq*2-8]
+ movu m3, [dst0q+stride3q -8]
+ movu m4, [dst4q+strideq*0-8]
+ movu m5, [dst4q+strideq*1-8]
+ movu m6, [dst4q+strideq*2-8]
+ movu m7, [dst4q+stride3q -8]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
+%endif
+
+ mova [%%p3], m0
+ mova [%%p2], m1
+ mova [%%p1], m2
+ mova [%%p0], m3
+%if ARCH_X86_64
+ mova [%%q0], m4
+%endif
+ mova [%%q1], m5
+ mova [%%q2], m6
+ mova [%%q3], m7
+
+ ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
+ ; order here accordingly
+%else ; %2 == 16
+
+%define %%p7 rsp+(%%tspoff+ 8)*mmsize
+%define %%p6 rsp+(%%tspoff+ 9)*mmsize
+%define %%p5 rsp+(%%tspoff+10)*mmsize
+%define %%p4 rsp+(%%tspoff+11)*mmsize
+%define %%q4 rsp+(%%tspoff+12)*mmsize
+%define %%q5 rsp+(%%tspoff+13)*mmsize
+%define %%q6 rsp+(%%tspoff+14)*mmsize
+%define %%q7 rsp+(%%tspoff+15)*mmsize
+
+ mova m0, [dst0q+strideq*0-16]
+ mova m1, [dst0q+strideq*1-16]
+ mova m2, [dst0q+strideq*2-16]
+ mova m3, [dst0q+stride3q -16]
+ mova m4, [dst4q+strideq*0-16]
+ mova m5, [dst4q+strideq*1-16]
+%if ARCH_X86_64
+ mova m6, [dst4q+strideq*2-16]
+%endif
+ mova m7, [dst4q+stride3q -16]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
+%endif
+
+ mova [%%p7], m0
+ mova [%%p6], m1
+ mova [%%p5], m2
+ mova [%%p4], m3
+%if ARCH_X86_64
+ mova [%%p3], m4
+%endif
+ mova [%%p2], m5
+ mova [%%p1], m6
+ mova [%%p0], m7
+
+ mova m0, [dst0q+strideq*0]
+ mova m1, [dst0q+strideq*1]
+ mova m2, [dst0q+strideq*2]
+ mova m3, [dst0q+stride3q ]
+ mova m4, [dst4q+strideq*0]
+ mova m5, [dst4q+strideq*1]
+%if ARCH_X86_64
+ mova m6, [dst4q+strideq*2]
+%endif
+ mova m7, [dst4q+stride3q ]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
+%endif
+
+ mova [%%q0], m0
+ mova [%%q1], m1
+ mova [%%q2], m2
+ mova [%%q3], m3
+%if ARCH_X86_64
+ mova [%%q4], m4
+%endif
+ mova [%%q5], m5
+ mova [%%q6], m6
+ mova [%%q7], m7
+
+ ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
+ ; order here accordingly
+%endif ; %2
+%endif ; %1
+
+ ; load q0|q4-7 data
+ mova m0, [%%q0]
+%if %2 == 16
+ mova m4, [%%q4]
+ mova m5, [%%q5]
+ mova m6, [%%q6]
+ mova m7, [%%q7]
+
+ ; flat8out q portion
+ FLAT8OUT_HALF
+ SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+ ; load q1-3 data
+ mova m1, [%%q1]
+ mova m2, [%%q2]
+ mova m3, [%%q3]
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flatout[q]
+ ; m12-14=free
+ ; m0-3=q0-q3
+ ; m4-7=free
+
+ ; flat8in|fm|hev q portion
+ FLAT8IN_HALF %2
+ SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV
+%if %2 > 4
+ SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8I
+%endif
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flat8out[q]
+ ; r10[m13]=hev[q]
+ ; r11[m14]=!flat8in[q]
+ ; m2=!fm[q]
+ ; m0,1=q0-q1
+ ; m2-7=free
+ ; m12=free
+
+ ; load p0-1
+ mova m3, [%%p0]
+ mova m4, [%%p1]
+
+ ; fm mb_edge portion
+ psubw m5, m3, m0 ; q0-p0
+ psubw m6, m4, m1 ; q1-p1
+%if ARCH_X86_64
+ ABS2 m5, m6, m7, m12 ; abs(q0-p0) | abs(q1-p1)
+%else
+ ABS1 m5, m7 ; abs(q0-p0)
+ ABS1 m6, m7 ; abs(q1-p1)
+%endif
+ paddw m5, m5
+ psraw m6, 1
+ paddw m6, m5 ; abs(q0-p0)*2+(abs(q1-p1)>>1)
+ pcmpgtw m6, reg_E
+ por m2, m6
+ SCRATCH 2, 12, rsp+(%%off+3)*mmsize, FM
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flat8out[q]
+ ; r10[m13]=hev[q]
+ ; r11[m14]=!flat8in[q]
+ ; r12[m12]=!fm[q]
+ ; m3-4=q0-1
+ ; m0-2/5-7=free
+
+ ; load p4-7 data
+ SWAP 3, 0 ; p0
+ SWAP 4, 1 ; p1
+%if %2 == 16
+ mova m7, [%%p7]
+ mova m6, [%%p6]
+ mova m5, [%%p5]
+ mova m4, [%%p4]
+
+ ; flat8out p portion
+ FLAT8OUT_HALF
+ por m7, reg_F8O
+ SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flat8out
+ ; r10[m13]=hev[q]
+ ; r11[m14]=!flat8in[q]
+ ; r12[m12]=!fm[q]
+ ; m0=p0
+ ; m1-7=free
+
+ ; load p2-3 data
+ mova m2, [%%p2]
+ mova m3, [%%p3]
+
+ ; flat8in|fm|hev p portion
+ FLAT8IN_HALF %2
+ por m7, reg_HEV
+%if %2 > 4
+ por m4, reg_F8I
+%endif
+ por m2, reg_FM
+%if %2 > 4
+ por m4, m2 ; !flat8|!fm
+%if %2 == 16
+ por m5, m4, reg_F8O ; !flat16|!fm
+ pandn m2, m4 ; filter4_mask
+ pandn m4, m5 ; filter8_mask
+ pxor m5, [pw_m1] ; filter16_mask
+ SCRATCH 5, 15, rsp+(%%off+6)*mmsize, F16M
+%else
+ pandn m2, m4 ; filter4_mask
+ pxor m4, [pw_m1] ; filter8_mask
+%endif
+ SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8M
+%else
+ pxor m2, [pw_m1] ; filter4_mask
+%endif
+ SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV
+ SCRATCH 2, 12, rsp+(%%off+3)*mmsize, F4M
+
+ ; r9[m15]=filter16_mask
+ ; r10[m13]=hev
+ ; r11[m14]=filter8_mask
+ ; r12[m12]=filter4_mask
+ ; m0,1=p0-p1
+ ; m2-7=free
+ ; m8-11=free
+
+%if %2 > 4
+%if %2 == 16
+ ; filter_14
+ mova m2, [%%p7]
+ mova m3, [%%p6]
+ mova m6, [%%p5]
+ mova m7, [%%p4]
+ PRELOAD 8, %%p3, P3
+ PRELOAD 9, %%p2, P2
+%endif
+ PRELOAD 10, %%q0, Q0
+ PRELOAD 11, %%q1, Q1
+%if %2 == 16
+ psllw m4, m2, 3
+ paddw m5, m3, m3
+ paddw m4, m6
+ paddw m5, m7
+ paddw m4, reg_P3
+ paddw m5, reg_P2
+ paddw m4, m1
+ paddw m5, m0
+ paddw m4, reg_Q0 ; q0+p1+p3+p5+p7*8
+ psubw m5, m2 ; p0+p2+p4+p6*2-p7
+ paddw m4, [pw_8]
+ paddw m5, m4 ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8
+
+ ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
+ ; at the end of the filter
+
+ mova [rsp+0*mmsize], m3
+ FILTER_STEP m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
+%endif
+ mova m3, [%%q2]
+%if %2 == 16
+ mova [rsp+1*mmsize], m6
+ FILTER_STEP m4, m5, F16M, 4, %%p5, m6, m2, m7, m3
+%endif
+ mova m6, [%%q3]
+%if %2 == 16
+ mova [rsp+2*mmsize], m7
+ FILTER_STEP m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6
+ mova m7, [%%q4]
+%if ARCH_X86_64
+ mova [rsp+3*mmsize], reg_P3
+%else
+ mova m4, reg_P3
+ mova [rsp+3*mmsize], m4
+%endif
+ FILTER_STEP m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7
+ PRELOAD 8, %%q5, Q5
+%if ARCH_X86_64
+ mova [rsp+4*mmsize], reg_P2
+%else
+ mova m4, reg_P2
+ mova [rsp+4*mmsize], m4
+%endif
+ FILTER_STEP m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5
+ PRELOAD 9, %%q6, Q6
+ mova [rsp+5*mmsize], m1
+ FILTER_STEP m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6
+ mova m1, [%%q7]
+ FILTER_STEP m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1
+ FILTER_STEP m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
+ FILTER_STEP m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64
+ FILTER_STEP m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1
+ FILTER_STEP m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1
+ FILTER_STEP m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1
+ FILTER_STEP m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
+ FILTER_STEP m4, m5, F16M, 4, %%q6, reg_Q6
+
+ mova m7, [%%p1]
+%else
+ SWAP 1, 7
+%endif
+
+ mova m2, [%%p3]
+ mova m1, [%%p2]
+
+ ; reg_Q0-1 (m10-m11)
+ ; m0=p0
+ ; m1=p2
+ ; m2=p3
+ ; m3=q2
+ ; m4-5=free
+ ; m6=q3
+ ; m7=p1
+ ; m8-9 unused
+
+ ; filter_6
+ psllw m4, m2, 2
+ paddw m5, m1, m1
+ paddw m4, m7
+ psubw m5, m2
+ paddw m4, m0
+ paddw m5, reg_Q0
+ paddw m4, [pw_4]
+ paddw m5, m4
+
+%if ARCH_X86_64
+ mova m8, m1
+ mova m9, m7
+%else
+ mova [rsp+0*mmsize], m1
+ mova [rsp+1*mmsize], m7
+%endif
+%ifidn %1, v
+ FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1
+%else
+ FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1
+%endif
+ FILTER_STEP m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1
+ FILTER_STEP m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1
+%if ARCH_X86_64
+ FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64
+ FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64
+%else
+ FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
+ FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64
+%endif
+ FILTER_STEP m4, m5, F8M, 3, %%q2, m3
+
+ UNSCRATCH 2, 10, %%q0
+ UNSCRATCH 6, 11, %%q1
+%else
+ SWAP 1, 7
+ mova m2, [%%q0]
+ mova m6, [%%q1]
+%endif
+ UNSCRATCH 3, 13, rsp+(%%off+4)*mmsize, HEV
+
+ ; m0=p0
+ ; m1=p2
+ ; m2=q0
+ ; m3=hev_mask
+ ; m4-5=free
+ ; m6=q1
+ ; m7=p1
+
+ ; filter_4
+ psubw m4, m7, m6 ; p1-q1
+ psubw m5, m2, m0 ; q0-p0
+ pand m4, m3
+ pminsw m4, [pw_ %+ %%maxsgn]
+ pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, bpp-1) -> f
+ paddw m4, m5
+ paddw m5, m5
+ paddw m4, m5 ; 3*(q0-p0)+f
+ pminsw m4, [pw_ %+ %%maxsgn]
+ pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, bpp-1) -> f
+ pand m4, reg_F4M
+ paddw m5, m4, [pw_4]
+ paddw m4, [pw_3]
+ pminsw m5, [pw_ %+ %%maxsgn]
+ pminsw m4, [pw_ %+ %%maxsgn]
+ psraw m5, 3 ; min_intp2(f+4, bpp-1)>>3 -> f1
+ psraw m4, 3 ; min_intp2(f+3, bpp-1)>>3 -> f2
+ psubw m2, m5 ; q0-f1
+ paddw m0, m4 ; p0+f2
+ pandn m3, m5 ; f1 & !hev (for p1/q1 adj)
+ pxor m4, m4
+ mova m5, [pw_ %+ %%maxusgn]
+ pmaxsw m2, m4
+ pmaxsw m0, m4
+ pminsw m2, m5
+ pminsw m0, m5
+%if cpuflag(ssse3)
+ pmulhrsw m3, [pw_16384] ; (f1+1)>>1
+%else
+ paddw m3, [pw_1]
+ psraw m3, 1
+%endif
+ paddw m7, m3 ; p1+f
+ psubw m6, m3 ; q1-f
+ pmaxsw m7, m4
+ pmaxsw m6, m4
+ pminsw m7, m5
+ pminsw m6, m5
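+
+ ; for reference, per pixel where filter4_mask is set this is roughly the
+ ; following C, with N = bpp - 1, MAX = (1 << bpp) - 1 (names illustrative):
+ ;
+ ;   f  = hev ? av_clip_intp2(p1 - q1, N) : 0;
+ ;   f  = av_clip_intp2(f + 3 * (q0 - p0), N);
+ ;   f1 = FFMIN(f + 4, (1 << N) - 1) >> 3;
+ ;   f2 = FFMIN(f + 3, (1 << N) - 1) >> 3;
+ ;   q0 = av_clip(q0 - f1, 0, MAX);
+ ;   p0 = av_clip(p0 + f2, 0, MAX);
+ ;   if (!hev) {
+ ;       q1 = av_clip(q1 - ((f1 + 1) >> 1), 0, MAX);
+ ;       p1 = av_clip(p1 + ((f1 + 1) >> 1), 0, MAX);
+ ;   }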
+
+ ; store
+%ifidn %1, v
+ mova [%%p1], m7
+ mova [%%p0], m0
+ mova [%%q0], m2
+ mova [%%q1], m6
+%else ; %1 == h
+%if %2 == 4
+ TRANSPOSE4x4W 7, 0, 2, 6, 1
+ movh [dst0q+strideq*0-4], m7
+ movhps [dst0q+strideq*1-4], m7
+ movh [dst0q+strideq*2-4], m0
+ movhps [dst0q+stride3q -4], m0
+ movh [dst4q+strideq*0-4], m2
+ movhps [dst4q+strideq*1-4], m2
+ movh [dst4q+strideq*2-4], m6
+ movhps [dst4q+stride3q -4], m6
+%elif %2 == 8
+ mova m3, [%%p3]
+ mova m4, [%%q2]
+ mova m5, [%%q3]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, 8
+%else
+ TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
+ mova m2, [%%q0]
+%endif
+
+ movu [dst0q+strideq*0-8], m3
+ movu [dst0q+strideq*1-8], m1
+ movu [dst0q+strideq*2-8], m7
+ movu [dst0q+stride3q -8], m0
+ movu [dst4q+strideq*0-8], m2
+ movu [dst4q+strideq*1-8], m6
+ movu [dst4q+strideq*2-8], m4
+ movu [dst4q+stride3q -8], m5
+%else ; %2 == 16
+ SCRATCH 2, 8, %%q0
+ SCRATCH 6, 9, %%q1
+ mova m2, [%%p7]
+ mova m3, [%%p6]
+ mova m4, [%%p5]
+ mova m5, [%%p4]
+ mova m6, [%%p3]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, 10
+%else
+ mova [%%p1], m7
+ TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
+%endif
+
+ mova [dst0q+strideq*0-16], m2
+ mova [dst0q+strideq*1-16], m3
+ mova [dst0q+strideq*2-16], m4
+ mova [dst0q+stride3q -16], m5
+%if ARCH_X86_64
+ mova [dst4q+strideq*0-16], m6
+%endif
+ mova [dst4q+strideq*1-16], m1
+ mova [dst4q+strideq*2-16], m7
+ mova [dst4q+stride3q -16], m0
+
+ UNSCRATCH 2, 8, %%q0
+ UNSCRATCH 6, 9, %%q1
+ mova m0, [%%q2]
+ mova m1, [%%q3]
+ mova m3, [%%q4]
+ mova m4, [%%q5]
+%if ARCH_X86_64
+ mova m5, [%%q6]
+%endif
+ mova m7, [%%q7]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, 8
+%else
+ TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
+%endif
+
+ mova [dst0q+strideq*0], m2
+ mova [dst0q+strideq*1], m6
+ mova [dst0q+strideq*2], m0
+ mova [dst0q+stride3q ], m1
+%if ARCH_X86_64
+ mova [dst4q+strideq*0], m3
+%endif
+ mova [dst4q+strideq*1], m4
+ mova [dst4q+strideq*2], m5
+ mova [dst4q+stride3q ], m7
+%endif ; %2
+%endif ; %1
+ RET
+%endmacro
+
+%macro LOOP_FILTER_CPUSETS 3
+INIT_XMM sse2
+LOOP_FILTER %1, %2, %3
+INIT_XMM ssse3
+LOOP_FILTER %1, %2, %3
+INIT_XMM avx
+LOOP_FILTER %1, %2, %3
+%endmacro
+
+%macro LOOP_FILTER_WDSETS 2
+LOOP_FILTER_CPUSETS %1, 4, %2
+LOOP_FILTER_CPUSETS %1, 8, %2
+LOOP_FILTER_CPUSETS %1, 16, %2
+%endmacro
+
+LOOP_FILTER_WDSETS h, 10
+LOOP_FILTER_WDSETS v, 10
+LOOP_FILTER_WDSETS h, 12
+LOOP_FILTER_WDSETS v, 12
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index c9701aea18..f64161b2c2 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -3,20 +3,20 @@
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -45,6 +45,13 @@ times 8 dw %7
times 8 dw %8
%endmacro
+%macro F8_16BPP_TAPS 8
+times 8 dw %1, %2
+times 8 dw %3, %4
+times 8 dw %5, %6
+times 8 dw %7, %8
+%endmacro
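+
+; each line interleaves two adjacent taps so that a pmaddwd on word-interleaved
+; pixel pairs yields, per dword lane, roughly:
+;
+;   partial = px[n] * tap[2*i] + px[n + 1] * tap[2*i + 1]
+;
+; which is the layout the 16 bpp MC code consumes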
+
%macro FILTER 1
const filters_%1 ; smooth
F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0
@@ -102,12 +109,15 @@ FILTER ssse3
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
FILTER sse2
+%define F8_TAPS F8_16BPP_TAPS
+; int16_t ff_filters_16bpp[3][15][4][16]
+FILTER 16bpp
SECTION .text
%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 15, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
pxor m5, m5
mova m6, [pw_64]
mova m7, [filteryq+ 0]
@@ -192,7 +202,7 @@ filter_sse2_h_fn avg
%macro filter_h_fn 1
%assign %%px mmsize/2
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
mova m6, [pw_256]
mova m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
@@ -253,7 +263,7 @@ filter_h_fn avg
%if ARCH_X86_64
%macro filter_hx2_fn 1
%assign %%px mmsize
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
mova m13, [pw_256]
mova m8, [filteryq+ 0]
mova m9, [filteryq+32]
@@ -315,9 +325,9 @@ filter_hx2_fn avg
%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%define hd r4mp
%endif
@@ -413,9 +423,9 @@ filter_sse2_v_fn avg
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%define hd r4mp
%endif
@@ -486,7 +496,7 @@ filter_v_fn avg
%macro filter_vx2_fn 1
%assign %%px mmsize
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
mova m13, [pw_256]
lea sstride3q, [sstrideq*3]
lea src4q, [srcq+sstrideq]
@@ -552,7 +562,7 @@ filter_vx2_fn avg
%endif ; ARCH_X86_64
-%macro fpel_fn 6
+%macro fpel_fn 6-8 0, 4
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
@@ -561,29 +571,57 @@ filter_vx2_fn avg
%define %%dstfn mova
%endif
+%if %7 == 8
+%define %%pavg pavgb
+%define %%szsuf _8
+%elif %7 == 16
+%define %%pavg pavgw
+%define %%szsuf _16
+%else
+%define %%szsuf
+%endif
+
%if %2 <= mmsize
-cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
lea sstride3q, [sstrideq*3]
lea dstride3q, [dstrideq*3]
%else
-cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
+cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
%endif
.loop:
%%srcfn m0, [srcq]
%%srcfn m1, [srcq+s%3]
%%srcfn m2, [srcq+s%4]
%%srcfn m3, [srcq+s%5]
+%if %2/mmsize == 8
+ %%srcfn m4, [srcq+mmsize*4]
+ %%srcfn m5, [srcq+mmsize*5]
+ %%srcfn m6, [srcq+mmsize*6]
+ %%srcfn m7, [srcq+mmsize*7]
+%endif
lea srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+d%3]
- pavgb m2, [dstq+d%4]
- pavgb m3, [dstq+d%5]
+ %%pavg m0, [dstq]
+ %%pavg m1, [dstq+d%3]
+ %%pavg m2, [dstq+d%4]
+ %%pavg m3, [dstq+d%5]
+%if %2/mmsize == 8
+ %%pavg m4, [dstq+mmsize*4]
+ %%pavg m5, [dstq+mmsize*5]
+ %%pavg m6, [dstq+mmsize*6]
+ %%pavg m7, [dstq+mmsize*7]
+%endif
%endif
%%dstfn [dstq], m0
%%dstfn [dstq+d%3], m1
%%dstfn [dstq+d%4], m2
%%dstfn [dstq+d%5], m3
+%if %2/mmsize == 8
+ %%dstfn [dstq+mmsize*4], m4
+ %%dstfn [dstq+mmsize*5], m5
+ %%dstfn [dstq+mmsize*6], m6
+ %%dstfn [dstq+mmsize*7], m7
+%endif
lea dstq, [dstq+dstrideq*%6]
sub hd, %6
jnz .loop
@@ -598,23 +636,38 @@ INIT_MMX mmx
fpel_fn put, 4, strideq, strideq*2, stride3q, 4
fpel_fn put, 8, strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
-fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 4, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 8
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 0, 8
INIT_XMM sse2
-fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
-fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 8
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 8
INIT_YMM avx
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 8
+%endif
+INIT_MMX mmxext
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 16
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
-fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16
%endif
%undef s16
%undef d16
diff --git a/libavcodec/x86/vp9mc_16bpp.asm b/libavcodec/x86/vp9mc_16bpp.asm
new file mode 100644
index 0000000000..9a462eaf80
--- /dev/null
+++ b/libavcodec/x86/vp9mc_16bpp.asm
@@ -0,0 +1,431 @@
+;******************************************************************************
+;* VP9 MC SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_64: times 8 dd 64
+
+cextern pw_1023
+cextern pw_4095
+
+SECTION .text
+
+%macro filter_h4_fn 1-2 12
+cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m11, m11
+%endif
+ mova m6, [pd_64]
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+32]
+ mova m9, [filteryq+64]
+ mova m10, [filteryq+96]
+%endif
+.loop:
+ movh m0, [srcq-6]
+ movh m1, [srcq-4]
+ movh m2, [srcq-2]
+ movh m3, [srcq+0]
+ movh m4, [srcq+2]
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ pmaddwd m0, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+%else
+ pmaddwd m2, [filteryq+32]
+%endif
+ movu m1, [srcq+4]
+ movu m3, [srcq+6]
+ paddd m0, m2
+ movu m2, [srcq+8]
+ add srcq, sstrideq
+ punpcklwd m4, m1
+ punpcklwd m3, m2
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m4, m9
+ pmaddwd m3, m10
+%else
+ pmaddwd m4, [filteryq+64]
+ pmaddwd m3, [filteryq+96]
+%endif
+ paddd m0, m4
+ paddd m0, m3
+ paddd m0, m6
+ psrad m0, 7
+%if cpuflag(sse4)
+ packusdw m0, m0
+%else
+ packssdw m0, m0
+%endif
+%ifidn %1, avg
+ movh m1, [dstq]
+%endif
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m11
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, m1
+%endif
+ movh [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
+%endmacro
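+
+; per output pixel the 4-wide horizontal filter above computes roughly the
+; following (filter[] holds the 8 taps, which sum to 128; illustrative C):
+;
+;   sum = 0;
+;   for (k = 0; k < 8; k++)
+;       sum += src[x + k - 3] * filter[k];
+;   dst[x] = av_clip((sum + 64) >> 7, 0, (1 << bpp) - 1);
+;
+; the avg variant additionally rounds towards the existing destination pixel
+; with pavgw, i.e. (filtered + dst[x] + 1) >> 1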
+
+INIT_XMM sse2
+filter_h4_fn put
+filter_h4_fn avg
+
+%macro filter_h_fn 1-2 12
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m11, m11
+%endif
+ mova m6, [pd_64]
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+32]
+ mova m9, [filteryq+64]
+ mova m10, [filteryq+96]
+%endif
+.loop:
+ movu m0, [srcq-6]
+ movu m1, [srcq-4]
+ movu m2, [srcq-2]
+ movu m3, [srcq+0]
+ movu m4, [srcq+2]
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+ pmaddwd m3, m8
+ pmaddwd m4, m9
+%else
+ pmaddwd m2, [filteryq+32]
+ pmaddwd m3, [filteryq+32]
+ pmaddwd m4, [filteryq+64]
+%endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m4
+ movu m2, [srcq+4]
+ movu m3, [srcq+6]
+ movu m4, [srcq+8]
+ add srcq, sstrideq
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m9
+ pmaddwd m3, m10
+ pmaddwd m4, m10
+%else
+ pmaddwd m2, [filteryq+64]
+ pmaddwd m3, [filteryq+96]
+ pmaddwd m4, [filteryq+96]
+%endif
+ paddd m1, m2
+ paddd m0, m3
+ paddd m1, m4
+ paddd m0, m6
+ paddd m1, m6
+ psrad m0, 7
+ psrad m1, 7
+%if cpuflag(sse4)
+ packusdw m0, m0
+ packusdw m1, m1
+%else
+ packssdw m0, m0
+ packssdw m1, m1
+%endif
+ punpcklwd m0, m1
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m11
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_h_fn put
+filter_h_fn avg
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_h_fn put
+filter_h_fn avg
+%endif
+
+%macro filter_v4_fn 1-2 12
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%define hd r4mp
+%endif
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m11, m11
+%endif
+ mova m6, [pd_64]
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+ 32]
+ mova m9, [filteryq+ 64]
+ mova m10, [filteryq+ 96]
+%endif
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
+ movh m0, [srcq]
+ movh m1, [srcq+sstrideq]
+ movh m2, [srcq+sstrideq*2]
+ movh m3, [srcq+sstride3q]
+ add srcq, sstrideq
+ movh m4, [src4q]
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ pmaddwd m0, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+%else
+ pmaddwd m2, [filteryq+ 32]
+%endif
+ movh m1, [src4q+sstrideq]
+ movh m3, [src4q+sstrideq*2]
+ paddd m0, m2
+ movh m2, [src4q+sstride3q]
+ add src4q, sstrideq
+ punpcklwd m4, m1
+ punpcklwd m3, m2
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m4, m9
+ pmaddwd m3, m10
+%else
+ pmaddwd m4, [filteryq+ 64]
+ pmaddwd m3, [filteryq+ 96]
+%endif
+ paddd m0, m4
+ paddd m0, m3
+ paddd m0, m6
+ psrad m0, 7
+%if cpuflag(sse4)
+ packusdw m0, m0
+%else
+ packssdw m0, m0
+%endif
+%ifidn %1, avg
+ movh m1, [dstq]
+%endif
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m11
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, m1
+%endif
+ movh [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%endif
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_v4_fn put
+filter_v4_fn avg
+
+%macro filter_v_fn 1-2 13
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%define hd r4mp
+%endif
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m12, m12
+%endif
+%if ARCH_X86_64
+ mova m11, [pd_64]
+%endif
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+ 32]
+ mova m9, [filteryq+ 64]
+ mova m10, [filteryq+ 96]
+%endif
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
+ movu m0, [srcq]
+ movu m1, [srcq+sstrideq]
+ movu m2, [srcq+sstrideq*2]
+ movu m3, [srcq+sstride3q]
+ add srcq, sstrideq
+ movu m4, [src4q]
+ SBUTTERFLY wd, 0, 1, 6
+ SBUTTERFLY wd, 2, 3, 6
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+ pmaddwd m3, m8
+%else
+ pmaddwd m2, [filteryq+ 32]
+ pmaddwd m3, [filteryq+ 32]
+%endif
+ paddd m0, m2
+ paddd m1, m3
+ movu m2, [src4q+sstrideq]
+ movu m3, [src4q+sstrideq*2]
+ SBUTTERFLY wd, 4, 2, 6
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m4, m9
+ pmaddwd m2, m9
+%else
+ pmaddwd m4, [filteryq+ 64]
+ pmaddwd m2, [filteryq+ 64]
+%endif
+ paddd m0, m4
+ paddd m1, m2
+ movu m4, [src4q+sstride3q]
+ add src4q, sstrideq
+ SBUTTERFLY wd, 3, 4, 6
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m3, m10
+ pmaddwd m4, m10
+%else
+ pmaddwd m3, [filteryq+ 96]
+ pmaddwd m4, [filteryq+ 96]
+%endif
+ paddd m0, m3
+ paddd m1, m4
+%if ARCH_X86_64
+ paddd m0, m11
+ paddd m1, m11
+%else
+ paddd m0, [pd_64]
+ paddd m1, [pd_64]
+%endif
+ psrad m0, 7
+ psrad m1, 7
+%if cpuflag(sse4)
+ packusdw m0, m1
+%else
+ packssdw m0, m1
+%endif
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m12
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%endif
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_v_fn put
+filter_v_fn avg
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_v_fn put
+filter_v_fn avg
+%endif
diff --git a/libavcodec/x86/w64xmmtest.c b/libavcodec/x86/w64xmmtest.c
index 8925573ade..8f2b8a6299 100644
--- a/libavcodec/x86/w64xmmtest.c
+++ b/libavcodec/x86/w64xmmtest.c
@@ -2,20 +2,20 @@
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
new file mode 100644
index 0000000000..0220885da6
--- /dev/null
+++ b/libavcodec/x86/xvididct.asm
@@ -0,0 +1,983 @@
+; XVID MPEG-4 VIDEO CODEC
+;
+; Conversion from gcc syntax to x264asm syntax with modifications
+; by Christophe Gisquet <christophe.gisquet@gmail.com>
+;
+; =========== SSE2 inverse discrete cosine transform ===========
+;
+; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
+;
+; Conversion to gcc syntax with modifications
+; by Alexander Strange <astrange@ithinksw.com>
+;
+; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+;
+; Vertical pass is an implementation of the scheme:
+; Loeffler C., Ligtenberg A., and Moschytz C.S.:
+; Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+; Proc. ICASSP 1989, 988-991.
+;
+; Horizontal pass is a double 4x4 vector/matrix multiplication,
+; (see also Intel's Application Note 922:
+; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+; Copyright (C) 1999 Intel Corporation)
+;
+; More details at http://skal.planet-d.net/coding/dct.html
+;
+; ======= MMX and XMM forward discrete cosine transform =======
+;
+; Copyright(C) 2001 Peter Ross <pross@xvid.org>
+;
+; Originally provided by Intel at AP-922
+; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
+; but in a limited edition.
+; New macro implements a column part for precise iDCT
+; The routine precision now satisfies IEEE standard 1180-1990.
+;
+; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
+; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
+;
+; http://www.elecard.com/peter/idct.html
+; http://www.linuxvideo.org/mpeg2dec/
+;
+; These examples contain code fragments for first stage iDCT 8x8
+; (for rows) and first stage DCT 8x8 (for columns)
+;
+; conversion to gcc syntax by Michael Niedermayer
+;
+; ======================================================================
+;
+; This file is part of FFmpeg.
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public License
+; along with FFmpeg; if not, write to the Free Software Foundation,
+; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+; Similar to tg_1_16 in MMX code
+tan1: times 8 dw 13036
+tan2: times 8 dw 27146
+tan3: times 8 dw 43790
+sqrt2: times 8 dw 23170
+
+; SSE2 tables
+iTab1: dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
+ dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
+ dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
+ dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
+iTab2: dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
+ dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
+ dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
+ dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
+iTab3: dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
+ dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
+ dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
+ dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
+iTab4: dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
+ dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
+ dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
+ dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
+
+%if ARCH_X86_32
+; -----------------------------------------------------------------------------
+;
+; The first stage iDCT 8x8 - inverse DCTs of rows
+;
+; -----------------------------------------------------------------------------
+; The 8-point inverse DCT direct algorithm
+; -----------------------------------------------------------------------------
+;
+; static const short w[32] = {
+; FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
+; FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
+; FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
+; FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
+; FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
+; FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
+; FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
+; FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
+;
+; #define DCT_8_INV_ROW(x, y)
+; {
+; int a0, a1, a2, a3, b0, b1, b2, b3;
+;
+; a0 = x[0] * w[0] + x[2] * w[1] + x[4] * w[2] + x[6] * w[3];
+; a1 = x[0] * w[4] + x[2] * w[5] + x[4] * w[6] + x[6] * w[7];
+; a2 = x[0] * w[8] + x[2] * w[9] + x[4] * w[10] + x[6] * w[11];
+; a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
+; b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
+; b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
+; b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
+; b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
+;
+; y[0] = SHIFT_ROUND(a0 + b0);
+; y[1] = SHIFT_ROUND(a1 + b1);
+; y[2] = SHIFT_ROUND(a2 + b2);
+; y[3] = SHIFT_ROUND(a3 + b3);
+; y[4] = SHIFT_ROUND(a3 - b3);
+; y[5] = SHIFT_ROUND(a2 - b2);
+; y[6] = SHIFT_ROUND(a1 - b1);
+; y[7] = SHIFT_ROUND(a0 - b0);
+; }
+;
+; -----------------------------------------------------------------------------
+;
+; In this implementation the outputs of the iDCT-1D are multiplied
+; for rows 0,4 - by cos_4_16,
+; for rows 1,7 - by cos_1_16,
+; for rows 2,6 - by cos_2_16,
+; for rows 3,5 - by cos_3_16
+; and are shifted to the left for better accuracy.
+;
+; For the constants used,
+; FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
+;
+; -----------------------------------------------------------------------------
+
+; -----------------------------------------------------------------------------
+; Tables for mmx processors
+; -----------------------------------------------------------------------------
+
+; Table for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_mmx: dw 16384, 16384, 16384, -16384
+ dw 21407, 8867, 8867, -21407 ; w07 w05 w03 w01
+ dw 16384, -16384, 16384, 16384 ; w14 w12 w10 w08
+ dw -8867, 21407, -21407, -8867 ; w15 w13 w11 w09
+ dw 22725, 12873, 19266, -22725 ; w22 w20 w18 w16
+ dw 19266, 4520, -4520, -12873 ; w23 w21 w19 w17
+ dw 12873, 4520, 4520, 19266 ; w30 w28 w26 w24
+ dw -22725, 19266, -12873, -22725 ; w31 w29 w27 w25
+; Table for rows 1,7 - constants are multiplied by cos_1_16
+ dw 22725, 22725, 22725, -22725 ; movq-> w06 w04 w02 w00
+ dw 29692, 12299, 12299, -29692 ; w07 w05 w03 w01
+ dw 22725, -22725, 22725, 22725 ; w14 w12 w10 w08
+ dw -12299, 29692, -29692, -12299 ; w15 w13 w11 w09
+ dw 31521, 17855, 26722, -31521 ; w22 w20 w18 w16
+ dw 26722, 6270, -6270, -17855 ; w23 w21 w19 w17
+ dw 17855, 6270, 6270, 26722 ; w30 w28 w26 w24
+ dw -31521, 26722, -17855, -31521 ; w31 w29 w27 w25
+; Table for rows 2,6 - constants are multiplied by cos_2_16
+ dw 21407, 21407, 21407, -21407 ; movq-> w06 w04 w02 w00
+ dw 27969, 11585, 11585, -27969 ; w07 w05 w03 w01
+ dw 21407, -21407, 21407, 21407 ; w14 w12 w10 w08
+ dw -11585, 27969, -27969, -11585 ; w15 w13 w11 w09
+ dw 29692, 16819, 25172, -29692 ; w22 w20 w18 w16
+ dw 25172, 5906, -5906, -16819 ; w23 w21 w19 w17
+ dw 16819, 5906, 5906, 25172 ; w30 w28 w26 w24
+ dw -29692, 25172, -16819, -29692 ; w31 w29 w27 w25
+; Table for rows 3,5 - constants are multiplied by cos_3_16
+ dw 19266, 19266, 19266, -19266 ; movq-> w06 w04 w02 w00
+ dw 25172, 10426, 10426, -25172 ; w07 w05 w03 w01
+ dw 19266, -19266, 19266, 19266 ; w14 w12 w10 w08
+ dw -10426, 25172, -25172, -10426 ; w15 w13 w11 w09
+ dw 26722, 15137, 22654, -26722 ; w22 w20 w18 w16
+ dw 22654, 5315, -5315, -15137 ; w23 w21 w19 w17
+ dw 15137, 5315, 5315, 22654 ; w30 w28 w26 w24
+ dw -26722, 22654, -15137, -26722 ; w31 w29 w27 w25
+
+; -----------------------------------------------------------------------------
+; Tables for xmm processors
+; -----------------------------------------------------------------------------
+
+; %3 for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_xmm: dw 16384, 21407, 16384, 8867 ; movq-> w05 w04 w01 w00
+ dw 16384, 8867, -16384, -21407 ; w07 w06 w03 w02
+ dw 16384, -8867, 16384, -21407 ; w13 w12 w09 w08
+ dw -16384, 21407, 16384, -8867 ; w15 w14 w11 w10
+ dw 22725, 19266, 19266, -4520 ; w21 w20 w17 w16
+ dw 12873, 4520, -22725, -12873 ; w23 w22 w19 w18
+ dw 12873, -22725, 4520, -12873 ; w29 w28 w25 w24
+ dw 4520, 19266, 19266, -22725 ; w31 w30 w27 w26
+; %3 for rows 1,7 - constants are multiplied by cos_1_16
+ dw 22725, 29692, 22725, 12299 ; movq-> w05 w04 w01 w00
+ dw 22725, 12299, -22725, -29692 ; w07 w06 w03 w02
+ dw 22725, -12299, 22725, -29692 ; w13 w12 w09 w08
+ dw -22725, 29692, 22725, -12299 ; w15 w14 w11 w10
+ dw 31521, 26722, 26722, -6270 ; w21 w20 w17 w16
+ dw 17855, 6270, -31521, -17855 ; w23 w22 w19 w18
+ dw 17855, -31521, 6270, -17855 ; w29 w28 w25 w24
+ dw 6270, 26722, 26722, -31521 ; w31 w30 w27 w26
+; %3 for rows 2,6 - constants are multiplied by cos_2_16
+ dw 21407, 27969, 21407, 11585 ; movq-> w05 w04 w01 w00
+ dw 21407, 11585, -21407, -27969 ; w07 w06 w03 w02
+ dw 21407, -11585, 21407, -27969 ; w13 w12 w09 w08
+ dw -21407, 27969, 21407, -11585 ; w15 w14 w11 w10
+ dw 29692, 25172, 25172, -5906 ; w21 w20 w17 w16
+ dw 16819, 5906, -29692, -16819 ; w23 w22 w19 w18
+ dw 16819, -29692, 5906, -16819 ; w29 w28 w25 w24
+ dw 5906, 25172, 25172, -29692 ; w31 w30 w27 w26
+; %3 for rows 3,5 - constants are multiplied by cos_3_16
+ dw 19266, 25172, 19266, 10426 ; movq-> w05 w04 w01 w00
+ dw 19266, 10426, -19266, -25172 ; w07 w06 w03 w02
+ dw 19266, -10426, 19266, -25172 ; w13 w12 w09 w08
+ dw -19266, 25172, 19266, -10426 ; w15 w14 w11 w10
+ dw 26722, 22654, 22654, -5315 ; w21 w20 w17 w16
+ dw 15137, 5315, -26722, -15137 ; w23 w22 w19 w18
+ dw 15137, -26722, 5315, -15137 ; w29 w28 w25 w24
+ dw 5315, 22654, 22654, -26722 ; w31 w30 w27 w26
+%endif ; ~ARCH_X86_32
+
+; Similar to rounder_0 in MMX code
+; first 4 similar, then: 4*8->6*16 5*8->4*16 6/7*8->5*16
+walkenIdctRounders: times 4 dd 65536
+ times 4 dd 3597
+ times 4 dd 2260
+ times 4 dd 1203
+ times 4 dd 120
+ times 4 dd 512
+ times 2 dd 0
+
+pb_127: times 8 db 127
+
+SECTION .text
+
+; Temporary storage before the column pass
+%define ROW1 xmm6
+%define ROW3 xmm4
+%define ROW5 xmm5
+%define ROW7 xmm7
+
+%macro CLEAR_ODD 1
+ pxor %1, %1
+%endmacro
+%macro PUT_ODD 1
+ pshufhw %1, xmm2, 0x1B
+%endmacro
+
+%macro MOV32 2
+%if ARCH_X86_32
+ movdqa %2, %1
+%endif
+%endmacro
+
+%macro CLEAR_EVEN 1
+%if ARCH_X86_64
+ CLEAR_ODD %1
+%endif
+%endmacro
+
+%macro PUT_EVEN 1
+%if ARCH_X86_64
+ PUT_ODD %1
+%else
+ pshufhw xmm2, xmm2, 0x1B
+ movdqa %1, xmm2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+%define ROW0 xmm8
+%define REG0 ROW0
+%define ROW2 xmm9
+%define REG2 ROW2
+%define ROW4 xmm10
+%define REG4 ROW4
+%define ROW6 xmm11
+%define REG6 ROW6
+%define XMMS xmm12
+%define SREG2 REG2
+%define TAN3 xmm13
+%define TAN1 xmm14
+%else
+%define ROW0 [BLOCK + 0*16]
+%define REG0 xmm4
+%define ROW2 [BLOCK + 2*16]
+%define REG2 xmm4
+%define ROW4 [BLOCK + 4*16]
+%define REG4 xmm6
+%define ROW6 [BLOCK + 6*16]
+%define REG6 xmm6
+%define XMMS xmm2
+%define SREG2 xmm7
+%define TAN3 xmm0
+%define TAN1 xmm2
+%endif
+
+%macro JZ 2
+ test %1, %1
+ jz .%2
+%endmacro
+
+%macro JNZ 2
+ test %1, %1
+ jnz .%2
+%endmacro
+
+%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
+ %3 %4
+ movq mm1, [%1]
+ por mm1, [%1 + 8]
+ paddusb mm1, mm0
+ pmovmskb %2, mm1
+%endmacro
+
+;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
+%macro TEST_TWO_ROWS 8
+ %5 %6
+ %7 %8
+ movq mm1, [%1 + 0]
+ por mm1, [%1 + 8]
+ movq mm2, [%2 + 0]
+ por mm2, [%2 + 8]
+ paddusb mm1, mm0
+ paddusb mm2, mm0
+ pmovmskb %3, mm1
+ pmovmskb %4, mm2
+%endmacro
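+
+; both tests rely on paddusb with pb_127 (loaded into mm0 by the caller):
+; any nonzero coefficient byte saturates to >= 128 and sets its sign bit, so
+; pmovmskb is nonzero iff the tested 16-byte row has a nonzero coefficient;
+; roughly:
+;
+;   nonzero = 0;
+;   for (i = 0; i < 8; i++)
+;       nonzero |= row[i];   /* row: the 8 int16_t coefficients */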
+
+; IDCT pass on rows.
+%macro iMTX_MULT 4-5 ; src, table, put, arg, rounder
+ movdqa xmm3, [%1]
+ movdqa xmm0, xmm3
+ pshufd xmm1, xmm3, 0x11 ; 4602
+ punpcklqdq xmm0, xmm0 ; 0246
+ pmaddwd xmm0, [%2]
+ pmaddwd xmm1, [%2+16]
+ pshufd xmm2, xmm3, 0xBB ; 5713
+ punpckhqdq xmm3, xmm3 ; 1357
+ pmaddwd xmm2, [%2+32]
+ pmaddwd xmm3, [%2+48]
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+%if %0 == 5
+ paddd xmm0, [walkenIdctRounders+%5]
+%endif
+ movdqa xmm3, xmm2
+ paddd xmm2, xmm0
+ psubd xmm0, xmm3
+ psrad xmm2, 11
+ psrad xmm0, 11
+ packssdw xmm2, xmm0
+ %3 %4
+%endmacro
+
+%macro iLLM_HEAD 0
+ movdqa TAN3, [tan3]
+ movdqa TAN1, [tan1]
+%endmacro
+
+%macro FIRST_HALF 2 ; %1=dct %2=type(normal,add,put)
+ psraw xmm5, 6
+ psraw REG0, 6
+ psraw TAN3, 6
+ psraw xmm3, 6
+ ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+ movdqa [%1+1*16], TAN3
+ movdqa [%1+2*16], xmm3
+ movdqa [%1+5*16], REG0
+ movdqa [%1+6*16], xmm5
+%else
+ ; Must now load args as gprs are no longer used for masks
+ ; DEST is set to where address of dest was loaded
+ %if ARCH_X86_32
+ %if %2 == 2 ; Not enough xmms, store
+ movdqa [%1+1*16], TAN3
+ movdqa [%1+2*16], xmm3
+ movdqa [%1+5*16], REG0
+ movdqa [%1+6*16], xmm5
+ %endif
+ %xdefine DEST r2q ; BLOCK is r0, stride r1
+ movifnidn DEST, destm
+ movifnidn strideq, stridem
+ %else
+ %xdefine DEST r0q
+ %endif
+ lea r3q, [3*strideq]
+ %if %2 == 1
+ packuswb TAN3, xmm3
+ packuswb xmm5, REG0
+ movq [DEST + strideq], TAN3
+ movhps [DEST + 2*strideq], TAN3
+ ; REG0 and TAN3 are now available (and likely used in second half)
+ %endif
+%endif
+%endmacro
+
+%macro SECOND_HALF 6 ; %1=dct %2=type(normal,add,put) 3-6: xmms
+ psraw %3, 6
+ psraw %4, 6
+ psraw %5, 6
+ psraw %6, 6
+ ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+ movdqa [%1+0*16], %3
+ movdqa [%1+3*16], %5
+ movdqa [%1+4*16], %6
+ movdqa [%1+7*16], %4
+%elif %2 == 1
+ packuswb %3, %5
+ packuswb %6, %4
+ ; address of dest may have been loaded
+ movq [DEST], %3
+ movhps [DEST + r3q], %3
+ lea DEST, [DEST + 4*strideq]
+ movq [DEST], %6
+ movhps [DEST + r3q], %6
+ ; and now write remainder of first half
+ movq [DEST + 2*strideq], xmm5
+ movhps [DEST + strideq], xmm5
+%elif %2 == 2
+ pxor xmm0, xmm0
+ %if ARCH_X86_32
+ ; free: m3 REG0=m4 m5
+ ; input: m1, m7, m2, m6
+ movq xmm3, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ paddsw xmm3, %3
+ paddsw xmm4, [%1 + 1*16]
+ movq %3, [DEST+2*strideq]
+ movq xmm5, [DEST+ r3q]
+ punpcklbw %3, xmm0
+ punpcklbw xmm5, xmm0
+ paddsw %3, [%1 + 2*16]
+ paddsw xmm5, %5
+ packuswb xmm3, xmm4
+ packuswb %3, xmm5
+ movq [DEST+0*strideq], xmm3
+ movhps [DEST+1*strideq], xmm3
+ movq [DEST+2*strideq], %3
+ movhps [DEST+ r3q], %3
+ lea DEST, [DEST+4*strideq]
+ movq xmm3, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ movq %3, [DEST+2*strideq]
+ movq xmm5, [DEST+ r3q]
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw %3, xmm0
+ punpcklbw xmm5, xmm0
+ paddsw xmm3, %6
+ paddsw xmm4, [%1 + 5*16]
+ paddsw %3, [%1 + 6*16]
+ paddsw xmm5, %4
+ packuswb xmm3, xmm4
+ packuswb %3, xmm5
+ movq [DEST+0*strideq], xmm3
+ movhps [DEST+1*strideq], xmm3
+ movq [DEST+2*strideq], %3
+ movhps [DEST+ r3q], %3
+ %else
+ ; l1:TAN3=m13 l2:m3 l5:REG0=m8 l6=m5
+ ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
+ movq xmm2, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ movq xmm12, [DEST+2*strideq]
+ movq xmm11, [DEST+ r3q]
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm12, xmm0
+ punpcklbw xmm11, xmm0
+ paddsw xmm2, %3
+ paddsw xmm4, TAN3
+ paddsw xmm12, xmm3
+ paddsw xmm11, %5
+ packuswb xmm2, xmm4
+ packuswb xmm12, xmm11
+ movq [DEST+0*strideq], xmm2
+ movhps [DEST+1*strideq], xmm2
+ movq [DEST+2*strideq], xmm12
+ movhps [DEST+ r3q], xmm12
+ lea DEST, [DEST+4*strideq]
+ movq xmm2, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ movq xmm12, [DEST+2*strideq]
+ movq xmm11, [DEST+ r3q]
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm12, xmm0
+ punpcklbw xmm11, xmm0
+ paddsw xmm2, %6
+ paddsw xmm4, REG0
+ paddsw xmm12, xmm5
+ paddsw xmm11, %4
+ packuswb xmm2, xmm4
+ packuswb xmm12, xmm11
+ movq [DEST+0*strideq], xmm2
+ movhps [DEST+1*strideq], xmm2
+ movq [DEST+2*strideq], xmm12
+ movhps [DEST+ r3q], xmm12
+ %endif
+%endif
+%endmacro
+
+
+; IDCT pass on columns.
+%macro iLLM_PASS 2 ; %1=dct %2=type(normal,add,put)
+ movdqa xmm1, TAN3
+ movdqa xmm3, TAN1
+ pmulhw TAN3, xmm4
+ pmulhw xmm1, xmm5
+ paddsw TAN3, xmm4
+ paddsw xmm1, xmm5
+ psubsw TAN3, xmm5
+ paddsw xmm1, xmm4
+ pmulhw xmm3, xmm7
+ pmulhw TAN1, xmm6
+ paddsw xmm3, xmm6
+ psubsw TAN1, xmm7
+ movdqa xmm7, xmm3
+ movdqa xmm6, TAN1
+ psubsw xmm3, xmm1
+ psubsw TAN1, TAN3
+ paddsw xmm1, xmm7
+ paddsw TAN3, xmm6
+ movdqa xmm6, xmm3
+ psubsw xmm3, TAN3
+ paddsw TAN3, xmm6
+ movdqa xmm4, [sqrt2]
+ pmulhw xmm3, xmm4
+ pmulhw TAN3, xmm4
+ paddsw TAN3, TAN3
+ paddsw xmm3, xmm3
+ movdqa xmm7, [tan2]
+ MOV32 ROW2, REG2
+ MOV32 ROW6, REG6
+ movdqa xmm5, xmm7
+ pmulhw xmm7, REG6
+ pmulhw xmm5, REG2
+ paddsw xmm7, REG2
+ psubsw xmm5, REG6
+ MOV32 ROW0, REG0
+ MOV32 ROW4, REG4
+ MOV32 TAN1, [BLOCK]
+ movdqa XMMS, REG0
+ psubsw REG0, REG4
+ paddsw REG4, XMMS
+ movdqa XMMS, REG4
+ psubsw REG4, xmm7
+ paddsw xmm7, XMMS
+ movdqa XMMS, REG0
+ psubsw REG0, xmm5
+ paddsw xmm5, XMMS
+ movdqa XMMS, xmm5
+ psubsw xmm5, TAN3
+ paddsw TAN3, XMMS
+ movdqa XMMS, REG0
+ psubsw REG0, xmm3
+ paddsw xmm3, XMMS
+ MOV32 [BLOCK], TAN1
+
+ FIRST_HALF %1, %2
+
+ movdqa xmm0, xmm7
+ movdqa xmm4, REG4
+ psubsw xmm7, xmm1
+ psubsw REG4, TAN1
+ paddsw xmm1, xmm0
+ paddsw TAN1, xmm4
+
+ SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
+%endmacro
+
+; IDCT pass on columns, assuming rows 4-7 are zero
+%macro iLLM_PASS_SPARSE 2 ; %1=dct %2=type(normal,put,add)
+ pmulhw TAN3, xmm4
+ paddsw TAN3, xmm4
+ movdqa xmm3, xmm6
+ pmulhw TAN1, xmm6
+ movdqa xmm1, xmm4
+ psubsw xmm3, xmm1
+ paddsw xmm1, xmm6
+ movdqa xmm6, TAN1
+ psubsw TAN1, TAN3
+ paddsw TAN3, xmm6
+ movdqa xmm6, xmm3
+ psubsw xmm3, TAN3
+ paddsw TAN3, xmm6
+ movdqa xmm4, [sqrt2]
+ pmulhw xmm3, xmm4
+ pmulhw TAN3, xmm4
+ paddsw TAN3, TAN3
+ paddsw xmm3, xmm3
+ movdqa xmm5, [tan2]
+ MOV32 ROW2, SREG2
+ pmulhw xmm5, SREG2
+ MOV32 ROW0, REG0
+ movdqa xmm6, REG0
+ psubsw xmm6, SREG2
+ paddsw SREG2, REG0
+ MOV32 TAN1, [BLOCK]
+ movdqa XMMS, REG0
+ psubsw REG0, xmm5
+ paddsw xmm5, XMMS
+ movdqa XMMS, xmm5
+ psubsw xmm5, TAN3
+ paddsw TAN3, XMMS
+ movdqa XMMS, REG0
+ psubsw REG0, xmm3
+ paddsw xmm3, XMMS
+ MOV32 [BLOCK], TAN1
+
+ FIRST_HALF %1, %2
+
+ movdqa xmm0, SREG2
+ movdqa xmm4, xmm6
+ psubsw SREG2, xmm1
+ psubsw xmm6, TAN1
+ paddsw xmm1, xmm0
+ paddsw TAN1, xmm4
+
+ SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
+%endmacro
+
+%macro IDCT_SSE2 1 ; 0=normal 1=put 2=add
+%if %1 == 0 || ARCH_X86_32
+ %define GPR0 r1d
+ %define GPR1 r2d
+ %define GPR2 r3d
+ %define GPR3 r4d
+ %define NUM_GPRS 5
+%else
+ %define GPR0 r3d
+ %define GPR1 r4d
+ %define GPR2 r5d
+ %define GPR3 r6d
+ %define NUM_GPRS 7
+%endif
+%if %1 == 0
+cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
+%xdefine BLOCK blockq
+%else
+ %if %1 == 1
+cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+ %else
+cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+ %endif
+ %if ARCH_X86_64
+ %xdefine BLOCK blockq
+ %else
+ mov r0q, blockm
+ %xdefine BLOCK r0q
+ %endif
+%endif
+ movq mm0, [pb_127]
+ iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
+ iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16
+ iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
+
+ TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
+ JZ GPR0, col1
+ iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
+.col1:
+ TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
+ TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
+
+ iLLM_HEAD
+ JNZ GPR1, 2
+ JNZ GPR0, 3
+ JNZ GPR2, 4
+ JNZ GPR3, 5
+ iLLM_PASS_SPARSE BLOCK, %1
+ jmp .6
+.2:
+ iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
+.3:
+ iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
+ JZ GPR2, col2
+.4:
+ iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
+.col2:
+ JZ GPR3, col3
+.5:
+ iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
+.col3:
+%if ARCH_X86_32
+ iLLM_HEAD
+%endif
+ iLLM_PASS BLOCK, %1
+.6:
+ RET
+%endmacro
+
+INIT_XMM sse2
+IDCT_SSE2 0
+IDCT_SSE2 1
+IDCT_SSE2 2
+
+%if ARCH_X86_32
+
+; %1=offset %2=tab_offset
+; %3=rnd_offset where 4*8->6*16 5*8->4*16 6/7*8->5*16
+%macro DCT_8_INV_ROW 3
+ movq mm0, [r0+16*%1+0] ; 0 ; x3 x2 x1 x0
+ movq mm1, [r0+16*%1+8] ; 1 ; x7 x6 x5 x4
+ movq mm2, mm0 ; 2 ; x3 x2 x1 x0
+ movq mm3, [%2+ 0] ; 3 ; w06 w04 w02 w00
+%if cpuflag(mmxext)
+ pshufw mm0, mm0, 0x88 ; x2 x0 x2 x0
+ movq mm4, [%2+ 8] ; 4 ; w07 w06 w03 w02
+ movq mm5, mm1 ; 5 ; x7 x6 x5 x4
+ pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00
+ movq mm6, [%2+32] ; 6 ; w21 w20 w17 w16
+ pshufw mm1, mm1, 0x88 ; x6 x4 x6 x4
+ pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
+ movq mm7, [%2+40] ; 7; w23 w22 w19 w18
+ pshufw mm2, mm2, 0xdd ; x3 x1 x3 x1
+ pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
+ pshufw mm5, mm5, 0xdd ; x7 x5 x7 x5
+ pmaddwd mm7, mm5 ; x7*w23+x5*w22 x7*w19+x5*w18
+ paddd mm3, [walkenIdctRounders + %3] ; +%3
+ pmaddwd mm0, [%2+16] ; x2*w13+x0*w12 x2*w09+x0*w08
+ paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0)
+ pmaddwd mm1, [%2+24] ; x6*w15+x4*w14 x6*w11+x4*w10
+ movq mm4, mm3 ; 4 ; a1 a0
+ pmaddwd mm2, [%2+48] ; x3*w29+x1*w28 x3*w25+x1*w24
+ paddd mm6, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0)
+ pmaddwd mm5, [%2+56] ; x7*w31+x5*w30 x7*w27+x5*w26
+ paddd mm3, mm6 ; a1+b1 a0+b0
+ paddd mm0, [walkenIdctRounders + %3] ; +%3
+ psrad mm3, 11 ; y1=a1+b1 y0=a0+b0
+ paddd mm0, mm1 ; 1 ; a3=sum(even3) a2=sum(even2)
+ psubd mm4, mm6 ; 6 ; a1-b1 a0-b0
+ movq mm7, mm0 ; 7 ; a3 a2
+ paddd mm2, mm5 ; 5 ; b3=sum(odd3) b2=sum(odd2)
+ paddd mm0, mm2 ; a3+b3 a2+b2
+ psrad mm4, 11 ; y6=a1-b1 y7=a0-b0
+ psubd mm7, mm2 ; 2 ; a3-b3 a2-b2
+ psrad mm0, 11 ; y3=a3+b3 y2=a2+b2
+ psrad mm7, 11 ; y4=a3-b3 y5=a2-b2
+ packssdw mm3, mm0 ; 0 ; y3 y2 y1 y0
+ packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5
+ movq [r0+16*%1+0], mm3 ; 3 ; save y3 y2 y1 y0
+ pshufw mm7, mm7, 0xb1 ; y7 y6 y5 y4
+%else
+ punpcklwd mm0, mm1 ; x5 x1 x4 x0
+ movq mm5, mm0 ; 5 ; x5 x1 x4 x0
+ punpckldq mm0, mm0 ; x4 x0 x4 x0
+ movq mm4, [%2+ 8] ; 4 ; w07 w05 w03 w01
+ punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2
+ pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00
+ movq mm6, mm2 ; 6 ; x7 x3 x6 x2
+ movq mm1, [%2+32] ; 1 ; w22 w20 w18 w16
+ punpckldq mm2, mm2 ; x6 x2 x6 x2
+ pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01
+ punpckhdq mm5, mm5 ; x5 x1 x5 x1
+ pmaddwd mm0, [%2+16] ; x4*w14+x0*w12 x4*w10+x0*w08
+ punpckhdq mm6, mm6 ; x7 x3 x7 x3
+ movq mm7, [%2+40] ; 7 ; w23 w21 w19 w17
+ pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16
+ paddd mm3, [walkenIdctRounders + %3] ; +%3
+ pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17
+ pmaddwd mm2, [%2+24] ; x6*w15+x2*w13 x6*w11+x2*w09
+ paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0)
+ pmaddwd mm5, [%2+48] ; x5*w30+x1*w28 x5*w26+x1*w24
+ movq mm4, mm3 ; 4 ; a1 a0
+ pmaddwd mm6, [%2+56] ; x7*w31+x3*w29 x7*w27+x3*w25
+ paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0)
+ paddd mm0, [walkenIdctRounders + %3] ; +%3
+ psubd mm3, mm1 ; a1-b1 a0-b0
+ psrad mm3, 11 ; y6=a1-b1 y7=a0-b0
+ paddd mm1, mm4 ; 4 ; a1+b1 a0+b0
+ paddd mm0, mm2 ; 2 ; a3=sum(even3) a2=sum(even2)
+ psrad mm1, 11 ; y1=a1+b1 y0=a0+b0
+ paddd mm5, mm6 ; 6 ; b3=sum(odd3) b2=sum(odd2)
+ movq mm4, mm0 ; 4 ; a3 a2
+ paddd mm0, mm5 ; a3+b3 a2+b2
+ psubd mm4, mm5 ; 5 ; a3-b3 a2-b2
+ psrad mm0, 11 ; y3=a3+b3 y2=a2+b2
+ psrad mm4, 11 ; y4=a3-b3 y5=a2-b2
+ packssdw mm1, mm0 ; 0 ; y3 y2 y1 y0
+ packssdw mm4, mm3 ; 3 ; y6 y7 y4 y5
+ movq mm7, mm4 ; 7 ; y6 y7 y4 y5
+ psrld mm4, 16 ; 0 y6 0 y4
+ pslld mm7, 16 ; y7 0 y5 0
+ movq [r0+16*%1+0], mm1 ; 1 ; save y3 y2 y1 y0
+ por mm7, mm4 ; 4 ; y7 y6 y5 y4
+%endif
+ movq [r0+16*%1+8], mm7 ; 7 ; save y7 y6 y5 y4
+%endmacro
+
+; -----------------------------------------------------------------------------
+;
+; The first stage DCT 8x8 - forward DCTs of columns
+;
+; The outputs are multiplied
+; for rows 0,4 - by cos_4_16,
+; for rows 1,7 - by cos_1_16,
+; for rows 2,6 - by cos_2_16,
+; for rows 3,5 - by cos_3_16
+; and are shifted to the left for better accuracy
+;
+; -----------------------------------------------------------------------------
+;
+; The 8-point scaled forward DCT algorithm (26a8m)
+;
+; -----------------------------------------------------------------------------
+;
+;#define DCT_8_FRW_COL(x, y)
+; {
+; short t0, t1, t2, t3, t4, t5, t6, t7;
+; short tp03, tm03, tp12, tm12, tp65, tm65;
+; short tp465, tm465, tp765, tm765;
+;
+; t0 = LEFT_SHIFT(x[0] + x[7]);
+; t1 = LEFT_SHIFT(x[1] + x[6]);
+; t2 = LEFT_SHIFT(x[2] + x[5]);
+; t3 = LEFT_SHIFT(x[3] + x[4]);
+; t4 = LEFT_SHIFT(x[3] - x[4]);
+; t5 = LEFT_SHIFT(x[2] - x[5]);
+; t6 = LEFT_SHIFT(x[1] - x[6]);
+; t7 = LEFT_SHIFT(x[0] - x[7]);
+;
+; tp03 = t0 + t3;
+; tm03 = t0 - t3;
+; tp12 = t1 + t2;
+; tm12 = t1 - t2;
+;
+; y[0] = tp03 + tp12;
+; y[4] = tp03 - tp12;
+;
+; y[2] = tm03 + tm12 * tg_2_16;
+; y[6] = tm03 * tg_2_16 - tm12;
+;
+; tp65 = (t6 + t5) * cos_4_16;
+; tm65 = (t6 - t5) * cos_4_16;
+;
+; tp765 = t7 + tp65;
+; tm765 = t7 - tp65;
+; tp465 = t4 + tm65;
+; tm465 = t4 - tm65;
+;
+; y[1] = tp765 + tp465 * tg_1_16;
+; y[7] = tp765 * tg_1_16 - tp465;
+; y[5] = tm765 * tg_3_16 + tm465;
+; y[3] = tm765 - tm465 * tg_3_16;
+; }
+;
+; -----------------------------------------------------------------------------
+
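+; Fixed-point note (a sketch, assuming the tan1/tan2/tan3/sqrt2 tables keep
+; the values of the original C constants): tan1 = 13036 and tan2 = 27146 are
+; tan(pi/16) and tan(2*pi/16) scaled by 1 << 16, tan3 = 43790 encodes
+; tan(3*pi/16) - 1 in the same scale (compensated by a paddsw in the column
+; macro below), and sqrt2 = 23170 is 0.5/sqrt(2) scaled by 1 << 16, so a
+; pmulhw against any of them is an (x * c) >> 16 multiply.
+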
+; -----------------------------------------------------------------------------
+; DCT_8_INV_COL INP (in place, four columns at a time)
+; -----------------------------------------------------------------------------
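+; A scalar sketch of what one DCT_8_INV_COL pass computes per column (names
+; follow the register comments in the macro below; MUL(c, x) denotes a
+; fixed-point multiply by the mathematical constant c, which the asm realises
+; with pmulhw plus the tan3 "-1" and sqrt2 half-value compensations, so this
+; is an outline rather than a drop-in replacement):
+;
+; tp17 = x[1] + MUL(tan1, x[7]);   tm17 = MUL(tan1, x[1]) - x[7];
+; tp35 = x[3] + MUL(tan3, x[5]);   tm35 = MUL(tan3, x[3]) - x[5];
+; b0 = tp17 + tp35;  b3 = tm17 - tm35;  t1 = tp17 - tp35;  t2 = tm17 + tm35;
+; b1 = 2 * MUL(sqrt2, t1 + t2);    b2 = 2 * MUL(sqrt2, t1 - t2);
+; tp26 = x[2] + MUL(tan2, x[6]);   tm26 = MUL(tan2, x[2]) - x[6];
+; tp04 = x[0] + x[4];              tm04 = x[0] - x[4];
+; a0 = tp04 + tp26;  a3 = tp04 - tp26;  a1 = tm04 + tm26;  a2 = tm04 - tm26;
+; y[i]     = (a[i] + b[i]) >> 6;   i = 0..3
+; y[7 - i] = (a[i] - b[i]) >> 6;
+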
+%macro DCT_8_INV_COL 1
+ movq mm0, [tan3]
+ movq mm3, [%1+16*3]
+ movq mm1, mm0 ; tg_3_16
+ movq mm5, [%1+16*5]
+ pmulhw mm0, mm3 ; x3*(tg_3_16-1)
+ movq mm4, [tan1]
+ pmulhw mm1, mm5 ; x5*(tg_3_16-1)
+ movq mm7, [%1+16*7]
+ movq mm2, mm4 ; tg_1_16
+ movq mm6, [%1+16*1]
+ pmulhw mm4, mm7 ; x7*tg_1_16
+ paddsw mm0, mm3 ; x3*tg_3_16
+ pmulhw mm2, mm6 ; x1*tg_1_16
+ paddsw mm1, mm3 ; x3+x5*(tg_3_16-1)
+ psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35
+ movq mm3, [sqrt2]
+ paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35
+ paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17
+ psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17
+ movq mm5, mm4 ; tp17
+ movq mm6, mm2 ; tm17
+ paddsw mm5, mm1 ; tp17+tp35 = b0
+ psubsw mm6, mm0 ; tm17-tm35 = b3
+ psubsw mm4, mm1 ; tp17-tp35 = t1
+ paddsw mm2, mm0 ; tm17+tm35 = t2
+ movq mm7, [tan2]
+ movq mm1, mm4 ; t1
+ movq [%1+3*16], mm5 ; save b0
+ paddsw mm1, mm2 ; t1+t2
+ movq [%1+5*16], mm6 ; save b3
+ psubsw mm4, mm2 ; t1-t2
+ movq mm5, [%1+2*16]
+ movq mm0, mm7 ; tg_2_16
+ movq mm6, [%1+6*16]
+ pmulhw mm0, mm5 ; x2*tg_2_16
+ pmulhw mm7, mm6 ; x6*tg_2_16
+ pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2
+ movq mm2, [%1+0*16]
+ pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2
+ psubsw mm0, mm6 ; t2*tg_2_16-x6 = tm26
+ movq mm3, mm2 ; x0
+ movq mm6, [%1+4*16]
+ paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26
+ paddsw mm2, mm6 ; x0+x4 = tp04
+ psubsw mm3, mm6 ; x0-x4 = tm04
+ movq mm5, mm2 ; tp04
+ movq mm6, mm3 ; tm04
+ psubsw mm2, mm7 ; tp04-tp26 = a3
+ paddsw mm3, mm0 ; tm04+tm26 = a1
+ paddsw mm1, mm1 ; b1
+ paddsw mm4, mm4 ; b2
+ paddsw mm5, mm7 ; tp04+tp26 = a0
+ psubsw mm6, mm0 ; tm04-tm26 = a2
+ movq mm7, mm3 ; a1
+ movq mm0, mm6 ; a2
+ paddsw mm3, mm1 ; a1+b1
+ paddsw mm6, mm4 ; a2+b2
+ psraw mm3, 6 ; dst1
+ psubsw mm7, mm1 ; a1-b1
+ psraw mm6, 6 ; dst2
+ psubsw mm0, mm4 ; a2-b2
+ movq mm1, [%1+3*16] ; load b0
+ psraw mm7, 6 ; dst6
+ movq mm4, mm5 ; a0
+ psraw mm0, 6 ; dst5
+ movq [%1+1*16], mm3
+ paddsw mm5, mm1 ; a0+b0
+ movq [%1+2*16], mm6
+ psubsw mm4, mm1 ; a0-b0
+ movq mm3, [%1+5*16] ; load b3
+ psraw mm5, 6 ; dst0
+ movq mm6, mm2 ; a3
+ psraw mm4, 6 ; dst7
+ movq [%1+5*16], mm0
+ paddsw mm2, mm3 ; a3+b3
+ movq [%1+6*16], mm7
+ psubsw mm6, mm3 ; a3-b3
+ movq [%1+0*16], mm5
+ psraw mm2, 6 ; dst3
+ movq [%1+7*16], mm4
+ psraw mm6, 6 ; dst4
+ movq [%1+3*16], mm2
+ movq [%1+4*16], mm6
+%endmacro
+
+%macro XVID_IDCT_MMX 0
+cglobal xvid_idct, 1, 1, 0, block
+%if cpuflag(mmxext)
+%define TAB tab_i_04_xmm
+%else
+%define TAB tab_i_04_mmx
+%endif
+ ; Process each row - beware of rounder offset
+ DCT_8_INV_ROW 0, TAB + 64 * 0, 0*16
+ DCT_8_INV_ROW 1, TAB + 64 * 1, 1*16
+ DCT_8_INV_ROW 2, TAB + 64 * 2, 2*16
+ DCT_8_INV_ROW 3, TAB + 64 * 3, 3*16
+ DCT_8_INV_ROW 4, TAB + 64 * 0, 6*16
+ DCT_8_INV_ROW 5, TAB + 64 * 3, 4*16
+ DCT_8_INV_ROW 6, TAB + 64 * 2, 5*16
+ DCT_8_INV_ROW 7, TAB + 64 * 1, 5*16
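+    ; Rows 4-7 reuse the weight tables of rows 0-3; their rounder offsets
+    ; follow the original per-row rounder table {65536, 3597, 2260, 1203, 0,
+    ; 120, 512, 512}, hence the non-sequential offsets above.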
+
+ ; Process the columns (4 at a time)
+ DCT_8_INV_COL r0+0
+ DCT_8_INV_COL r0+8
+
+ RET
+%endmacro
+
+INIT_MMX mmx
+XVID_IDCT_MMX
+INIT_MMX mmxext
+XVID_IDCT_MMX
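+
+; The two instantiations above assemble ff_xvid_idct_mmx() and
+; ff_xvid_idct_mmxext(); xvididct_init.c wraps them with the clamped put/add
+; helpers to provide idct_put and idct_add.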
+
+%endif ; ~ARCH_X86_32
diff --git a/libavcodec/x86/xvididct.h b/libavcodec/x86/xvididct.h
index 6640b6b78c..edb5ebfd31 100644
--- a/libavcodec/x86/xvididct.h
+++ b/libavcodec/x86/xvididct.h
@@ -1,20 +1,20 @@
/*
* XVID MPEG-4 VIDEO CODEC
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -38,7 +38,7 @@ void ff_xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
void ff_xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_xvid_idct_sse2(short *block);
-void ff_xvid_idct_sse2_put(uint8_t *dest, ptrdiff_t line_size, short *block);
-void ff_xvid_idct_sse2_add(uint8_t *dest, ptrdiff_t line_size, short *block);
+void ff_xvid_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, short *block);
+void ff_xvid_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, short *block);
#endif /* AVCODEC_X86_XVIDIDCT_H */
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index e4f7345795..fd10953829 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,9 +26,36 @@
#include "idctdsp.h"
#include "xvididct.h"
+#if ARCH_X86_32 && HAVE_YASM
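+/* Thin wrappers for the x86_32-only MMX/MMXEXT IDCT: transform the block in
+ * place, then write it out with the shared clamped put/add helpers. */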
+static void xvid_idct_mmx_put(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+ ff_xvid_idct_mmx(block);
+ ff_put_pixels_clamped(block, dest, line_size);
+}
+
+static void xvid_idct_mmx_add(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+ ff_xvid_idct_mmx(block);
+ ff_add_pixels_clamped(block, dest, line_size);
+}
+
+static void xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+ ff_xvid_idct_mmxext(block);
+ ff_put_pixels_clamped(block, dest, line_size);
+}
+
+static void xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+ ff_xvid_idct_mmxext(block);
+ ff_add_pixels_clamped(block, dest, line_size);
+}
+#endif
+
av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
+#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (high_bit_depth ||
@@ -36,24 +63,27 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
avctx->idct_algo == FF_IDCT_XVID))
return;
- if (INLINE_MMX(cpu_flags)) {
- c->idct_put = ff_xvid_idct_mmx_put;
- c->idct_add = ff_xvid_idct_mmx_add;
+#if ARCH_X86_32
+ if (EXTERNAL_MMX(cpu_flags)) {
+ c->idct_put = xvid_idct_mmx_put;
+ c->idct_add = xvid_idct_mmx_add;
c->idct = ff_xvid_idct_mmx;
c->perm_type = FF_IDCT_PERM_NONE;
}
- if (INLINE_MMXEXT(cpu_flags)) {
- c->idct_put = ff_xvid_idct_mmxext_put;
- c->idct_add = ff_xvid_idct_mmxext_add;
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->idct_put = xvid_idct_mmxext_put;
+ c->idct_add = xvid_idct_mmxext_add;
c->idct = ff_xvid_idct_mmxext;
c->perm_type = FF_IDCT_PERM_NONE;
}
+#endif
- if (INLINE_SSE2(cpu_flags)) {
- c->idct_put = ff_xvid_idct_sse2_put;
- c->idct_add = ff_xvid_idct_sse2_add;
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->idct_put = ff_xvid_idct_put_sse2;
+ c->idct_add = ff_xvid_idct_add_sse2;
c->idct = ff_xvid_idct_sse2;
c->perm_type = FF_IDCT_PERM_SSE2;
}
+#endif /* HAVE_YASM */
}
diff --git a/libavcodec/x86/xvididct_mmx.c b/libavcodec/x86/xvididct_mmx.c
deleted file mode 100644
index 9bb407cd70..0000000000
--- a/libavcodec/x86/xvididct_mmx.c
+++ /dev/null
@@ -1,548 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - MMX and XMM forward discrete cosine transform -
- *
- * Copyright(C) 2001 Peter Ross <pross@xvid.org>
- *
- * Originally provided by Intel at AP-922
- * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
- * but in a limited edition.
- * New macro implements a column part for precise iDCT
- * The routine precision now satisfies IEEE standard 1180-1990.
- *
- * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
- * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
- *
- * http://www.elecard.com/peter/idct.html
- * http://www.linuxvideo.org/mpeg2dec/
- *
- * These examples contain code fragments for first stage iDCT 8x8
- * (for rows) and first stage DCT 8x8 (for columns)
- *
- * conversion to gcc syntax by Michael Niedermayer
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <inttypes.h>
-
-#include "config.h"
-
-#include "libavutil/mem.h"
-
-#include "libavcodec/avcodec.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_MMX_INLINE
-
-// -----------------------------------------------------------------------------
-// Various memory constants (trigonometric values or rounding values)
-// -----------------------------------------------------------------------------
-
-DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4 * 4] = {
- 13036, 13036, 13036, 13036, // tg * (2 << 16) + 0.5
- 27146, 27146, 27146, 27146, // tg * (2 << 16) + 0.5
- -21746, -21746, -21746, -21746, // tg * (2 << 16) + 0.5
- 23170, 23170, 23170, 23170
-}; // cos * (2 << 15) + 0.5
-
-DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2 * 8] = {
- 65536, 65536,
- 3597, 3597,
- 2260, 2260,
- 1203, 1203,
- 0, 0,
- 120, 120,
- 512, 512,
- 512, 512
-};
-
-// -----------------------------------------------------------------------------
-//
-// The first stage iDCT 8x8 - inverse DCTs of rows
-//
-// -----------------------------------------------------------------------------
-// The 8-point inverse DCT direct algorithm
-// -----------------------------------------------------------------------------
-//
-// static const short w[32] = {
-// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
-// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
-// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
-// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
-// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
-// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
-// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
-// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
-//
-// #define DCT_8_INV_ROW(x, y)
-// {
-// int a0, a1, a2, a3, b0, b1, b2, b3;
-//
-// a0 = x[0] * w[0] + x[2] * w[1] + x[4] * w[2] + x[6] * w[3];
-// a1 = x[0] * w[4] + x[2] * w[5] + x[4] * w[6] + x[6] * w[7];
-// a2 = x[0] * w[8] + x[2] * w[9] + x[4] * w[10] + x[6] * w[11];
-// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
-// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
-// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
-// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
-// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
-//
-// y[0] = SHIFT_ROUND(a0 + b0);
-// y[1] = SHIFT_ROUND(a1 + b1);
-// y[2] = SHIFT_ROUND(a2 + b2);
-// y[3] = SHIFT_ROUND(a3 + b3);
-// y[4] = SHIFT_ROUND(a3 - b3);
-// y[5] = SHIFT_ROUND(a2 - b2);
-// y[6] = SHIFT_ROUND(a1 - b1);
-// y[7] = SHIFT_ROUND(a0 - b0);
-// }
-//
-// -----------------------------------------------------------------------------
-//
-// In this implementation the outputs of the iDCT-1D are multiplied
-// for rows 0,4 - by cos_4_16,
-// for rows 1,7 - by cos_1_16,
-// for rows 2,6 - by cos_2_16,
-// for rows 3,5 - by cos_3_16
-// and are shifted to the left for better accuracy.
-//
-// For the constants used,
-// FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
-//
-// -----------------------------------------------------------------------------
-
-// -----------------------------------------------------------------------------
-// Tables for mmx processors
-// -----------------------------------------------------------------------------
-
-// Table for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32 * 4] = {
- 16384, 16384, 16384, -16384, // movq-> w06 w04 w02 w00
- 21407, 8867, 8867, -21407, // w07 w05 w03 w01
- 16384, -16384, 16384, 16384, // w14 w12 w10 w08
- -8867, 21407, -21407, -8867, // w15 w13 w11 w09
- 22725, 12873, 19266, -22725, // w22 w20 w18 w16
- 19266, 4520, -4520, -12873, // w23 w21 w19 w17
- 12873, 4520, 4520, 19266, // w30 w28 w26 w24
- -22725, 19266, -12873, -22725, // w31 w29 w27 w25
-// Table for rows 1,7 - constants are multiplied by cos_1_16
- 22725, 22725, 22725, -22725, // movq-> w06 w04 w02 w00
- 29692, 12299, 12299, -29692, // w07 w05 w03 w01
- 22725, -22725, 22725, 22725, // w14 w12 w10 w08
- -12299, 29692, -29692, -12299, // w15 w13 w11 w09
- 31521, 17855, 26722, -31521, // w22 w20 w18 w16
- 26722, 6270, -6270, -17855, // w23 w21 w19 w17
- 17855, 6270, 6270, 26722, // w30 w28 w26 w24
- -31521, 26722, -17855, -31521, // w31 w29 w27 w25
-// Table for rows 2,6 - constants are multiplied by cos_2_16
- 21407, 21407, 21407, -21407, // movq-> w06 w04 w02 w00
- 27969, 11585, 11585, -27969, // w07 w05 w03 w01
- 21407, -21407, 21407, 21407, // w14 w12 w10 w08
- -11585, 27969, -27969, -11585, // w15 w13 w11 w09
- 29692, 16819, 25172, -29692, // w22 w20 w18 w16
- 25172, 5906, -5906, -16819, // w23 w21 w19 w17
- 16819, 5906, 5906, 25172, // w30 w28 w26 w24
- -29692, 25172, -16819, -29692, // w31 w29 w27 w25
-// Table for rows 3,5 - constants are multiplied by cos_3_16
- 19266, 19266, 19266, -19266, // movq-> w06 w04 w02 w00
- 25172, 10426, 10426, -25172, // w07 w05 w03 w01
- 19266, -19266, 19266, 19266, // w14 w12 w10 w08
- -10426, 25172, -25172, -10426, // w15 w13 w11 w09
- 26722, 15137, 22654, -26722, // w22 w20 w18 w16
- 22654, 5315, -5315, -15137, // w23 w21 w19 w17
- 15137, 5315, 5315, 22654, // w30 w28 w26 w24
- -26722, 22654, -15137, -26722, // w31 w29 w27 w25
-};
-// -----------------------------------------------------------------------------
-// Tables for xmm processors
-// -----------------------------------------------------------------------------
-
-// %3 for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32 * 4] = {
- 16384, 21407, 16384, 8867, // movq-> w05 w04 w01 w00
- 16384, 8867, -16384, -21407, // w07 w06 w03 w02
- 16384, -8867, 16384, -21407, // w13 w12 w09 w08
- -16384, 21407, 16384, -8867, // w15 w14 w11 w10
- 22725, 19266, 19266, -4520, // w21 w20 w17 w16
- 12873, 4520, -22725, -12873, // w23 w22 w19 w18
- 12873, -22725, 4520, -12873, // w29 w28 w25 w24
- 4520, 19266, 19266, -22725, // w31 w30 w27 w26
-// %3 for rows 1,7 - constants are multiplied by cos_1_16
- 22725, 29692, 22725, 12299, // movq-> w05 w04 w01 w00
- 22725, 12299, -22725, -29692, // w07 w06 w03 w02
- 22725, -12299, 22725, -29692, // w13 w12 w09 w08
- -22725, 29692, 22725, -12299, // w15 w14 w11 w10
- 31521, 26722, 26722, -6270, // w21 w20 w17 w16
- 17855, 6270, -31521, -17855, // w23 w22 w19 w18
- 17855, -31521, 6270, -17855, // w29 w28 w25 w24
- 6270, 26722, 26722, -31521, // w31 w30 w27 w26
-// %3 for rows 2,6 - constants are multiplied by cos_2_16
- 21407, 27969, 21407, 11585, // movq-> w05 w04 w01 w00
- 21407, 11585, -21407, -27969, // w07 w06 w03 w02
- 21407, -11585, 21407, -27969, // w13 w12 w09 w08
- -21407, 27969, 21407, -11585, // w15 w14 w11 w10
- 29692, 25172, 25172, -5906, // w21 w20 w17 w16
- 16819, 5906, -29692, -16819, // w23 w22 w19 w18
- 16819, -29692, 5906, -16819, // w29 w28 w25 w24
- 5906, 25172, 25172, -29692, // w31 w30 w27 w26
-// %3 for rows 3,5 - constants are multiplied by cos_3_16
- 19266, 25172, 19266, 10426, // movq-> w05 w04 w01 w00
- 19266, 10426, -19266, -25172, // w07 w06 w03 w02
- 19266, -10426, 19266, -25172, // w13 w12 w09 w08
- -19266, 25172, 19266, -10426, // w15 w14 w11 w10
- 26722, 22654, 22654, -5315, // w21 w20 w17 w16
- 15137, 5315, -26722, -15137, // w23 w22 w19 w18
- 15137, -26722, 5315, -15137, // w29 w28 w25 w24
- 5315, 22654, 22654, -26722, // w31 w30 w27 w26
-};
-// =============================================================================
-// Helper macros for the code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_MMX(A1, A2, A3, A4) \
- "movq "#A1", %%mm0 \n\t" /* 0 ; x3 x2 x1 x0 */ \
- "movq 8+"#A1", %%mm1 \n\t" /* 1 ; x7 x6 x5 x4 */ \
- "movq %%mm0, %%mm2 \n\t" /* 2 ; x3 x2 x1 x0 */ \
- "movq "#A3", %%mm3 \n\t" /* 3 ; w06 w04 w02 w00 */ \
- "punpcklwd %%mm1, %%mm0 \n\t" /* x5 x1 x4 x0 */ \
- "movq %%mm0, %%mm5 \n\t" /* 5 ; x5 x1 x4 x0 */ \
- "punpckldq %%mm0, %%mm0 \n\t" /* x4 x0 x4 x0 */ \
- "movq 8+"#A3", %%mm4 \n\t" /* 4 ; w07 w05 w03 w01 */ \
- "punpckhwd %%mm1, %%mm2 \n\t" /* 1 ; x7 x3 x6 x2 */ \
- "pmaddwd %%mm0, %%mm3 \n\t" /* x4*w06+x0*w04 x4*w02+x0*w00 */ \
- "movq %%mm2, %%mm6 \n\t" /* 6 ; x7 x3 x6 x2 */ \
- "movq 32+"#A3", %%mm1 \n\t" /* 1 ; w22 w20 w18 w16 */ \
- "punpckldq %%mm2, %%mm2 \n\t" /* x6 x2 x6 x2 */ \
- "pmaddwd %%mm2, %%mm4 \n\t" /* x6*w07+x2*w05 x6*w03+x2*w01 */ \
- "punpckhdq %%mm5, %%mm5 \n\t" /* x5 x1 x5 x1 */ \
- "pmaddwd 16+"#A3", %%mm0 \n\t" /* x4*w14+x0*w12 x4*w10+x0*w08 */ \
- "punpckhdq %%mm6, %%mm6 \n\t" /* x7 x3 x7 x3 */ \
- "movq 40+ "#A3", %%mm7 \n\t" /* 7 ; w23 w21 w19 w17 */ \
- "pmaddwd %%mm5, %%mm1 \n\t" /* x5*w22+x1*w20 x5*w18+x1*w16 */ \
- "paddd "#A4", %%mm3 \n\t" /* +%4 */ \
- "pmaddwd %%mm6, %%mm7 \n\t" /* x7*w23+x3*w21 x7*w19+x3*w17 */ \
- "pmaddwd 24+"#A3", %%mm2 \n\t" /* x6*w15+x2*w13 x6*w11+x2*w09 */ \
- "paddd %%mm4, %%mm3 \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */ \
- "pmaddwd 48+"#A3", %%mm5 \n\t" /* x5*w30+x1*w28 x5*w26+x1*w24 */ \
- "movq %%mm3, %%mm4 \n\t" /* 4 ; a1 a0 */ \
- "pmaddwd 56+"#A3", %%mm6 \n\t" /* x7*w31+x3*w29 x7*w27+x3*w25 */ \
- "paddd %%mm7, %%mm1 \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */ \
- "paddd "#A4", %%mm0 \n\t" /* +%4 */ \
- "psubd %%mm1, %%mm3 \n\t" /* a1-b1 a0-b0 */ \
- "psrad $11, %%mm3 \n\t" /* y6=a1-b1 y7=a0-b0 */ \
- "paddd %%mm4, %%mm1 \n\t" /* 4 ; a1+b1 a0+b0 */ \
- "paddd %%mm2, %%mm0 \n\t" /* 2 ; a3=sum(even3) a2=sum(even2) */ \
- "psrad $11, %%mm1 \n\t" /* y1=a1+b1 y0=a0+b0 */ \
- "paddd %%mm6, %%mm5 \n\t" /* 6 ; b3=sum(odd3) b2=sum(odd2) */ \
- "movq %%mm0, %%mm4 \n\t" /* 4 ; a3 a2 */ \
- "paddd %%mm5, %%mm0 \n\t" /* a3+b3 a2+b2 */ \
- "psubd %%mm5, %%mm4 \n\t" /* 5 ; a3-b3 a2-b2 */ \
- "psrad $11, %%mm0 \n\t" /* y3=a3+b3 y2=a2+b2 */ \
- "psrad $11, %%mm4 \n\t" /* y4=a3-b3 y5=a2-b2 */ \
- "packssdw %%mm0, %%mm1 \n\t" /* 0 ; y3 y2 y1 y0 */ \
- "packssdw %%mm3, %%mm4 \n\t" /* 3 ; y6 y7 y4 y5 */ \
- "movq %%mm4, %%mm7 \n\t" /* 7 ; y6 y7 y4 y5 */ \
- "psrld $16, %%mm4 \n\t" /* 0 y6 0 y4 */ \
- "pslld $16, %%mm7 \n\t" /* y7 0 y5 0 */ \
- "movq %%mm1, "#A2" \n\t" /* 1 ; save y3 y2 y1 y0 */ \
- "por %%mm4, %%mm7 \n\t" /* 4 ; y7 y6 y5 y4 */ \
- "movq %%mm7, 8+"#A2" \n\t" /* 7 ; save y7 y6 y5 y4 */ \
-
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_XMM(A1, A2, A3, A4) \
- "movq "#A1", %%mm0 \n\t" /* 0 ; x3 x2 x1 x0 */ \
- "movq 8+"#A1", %%mm1 \n\t" /* 1 ; x7 x6 x5 x4 */ \
- "movq %%mm0, %%mm2 \n\t" /* 2 ; x3 x2 x1 x0 */ \
- "movq "#A3", %%mm3 \n\t" /* 3 ; w05 w04 w01 w00 */ \
- "pshufw $0x88, %%mm0, %%mm0 \n\t" /* x2 x0 x2 x0 */ \
- "movq 8+"#A3", %%mm4 \n\t" /* 4 ; w07 w06 w03 w02 */ \
- "movq %%mm1, %%mm5 \n\t" /* 5 ; x7 x6 x5 x4 */ \
- "pmaddwd %%mm0, %%mm3 \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00 */ \
- "movq 32+"#A3", %%mm6 \n\t" /* 6 ; w21 w20 w17 w16 */ \
- "pshufw $0x88, %%mm1, %%mm1 \n\t" /* x6 x4 x6 x4 */ \
- "pmaddwd %%mm1, %%mm4 \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02 */ \
- "movq 40+"#A3", %%mm7 \n\t" /* 7; w23 w22 w19 w18 */ \
- "pshufw $0xdd, %%mm2, %%mm2 \n\t" /* x3 x1 x3 x1 */ \
- "pmaddwd %%mm2, %%mm6 \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16 */ \
- "pshufw $0xdd, %%mm5, %%mm5 \n\t" /* x7 x5 x7 x5 */ \
- "pmaddwd %%mm5, %%mm7 \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18 */ \
- "paddd "#A4", %%mm3 \n\t" /* +%4 */ \
- "pmaddwd 16+"#A3", %%mm0 \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08 */ \
- "paddd %%mm4, %%mm3 \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */ \
- "pmaddwd 24+"#A3", %%mm1 \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10 */ \
- "movq %%mm3, %%mm4 \n\t" /* 4 ; a1 a0 */ \
- "pmaddwd 48+"#A3", %%mm2 \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24 */ \
- "paddd %%mm7, %%mm6 \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */ \
- "pmaddwd 56+"#A3", %%mm5 \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26 */ \
- "paddd %%mm6, %%mm3 \n\t" /* a1+b1 a0+b0 */ \
- "paddd "#A4", %%mm0 \n\t" /* +%4 */ \
- "psrad $11, %%mm3 \n\t" /* y1=a1+b1 y0=a0+b0 */ \
- "paddd %%mm1, %%mm0 \n\t" /* 1 ; a3=sum(even3) a2=sum(even2) */ \
- "psubd %%mm6, %%mm4 \n\t" /* 6 ; a1-b1 a0-b0 */ \
- "movq %%mm0, %%mm7 \n\t" /* 7 ; a3 a2 */ \
- "paddd %%mm5, %%mm2 \n\t" /* 5 ; b3=sum(odd3) b2=sum(odd2) */ \
- "paddd %%mm2, %%mm0 \n\t" /* a3+b3 a2+b2 */ \
- "psrad $11, %%mm4 \n\t" /* y6=a1-b1 y7=a0-b0 */ \
- "psubd %%mm2, %%mm7 \n\t" /* 2 ; a3-b3 a2-b2 */ \
- "psrad $11, %%mm0 \n\t" /* y3=a3+b3 y2=a2+b2 */ \
- "psrad $11, %%mm7 \n\t" /* y4=a3-b3 y5=a2-b2 */ \
- "packssdw %%mm0, %%mm3 \n\t" /* 0 ; y3 y2 y1 y0 */ \
- "packssdw %%mm4, %%mm7 \n\t" /* 4 ; y6 y7 y4 y5 */ \
- "movq %%mm3, "#A2" \n\t" /* 3 ; save y3 y2 y1 y0 */ \
- "pshufw $0xb1, %%mm7, %%mm7 \n\t" /* y7 y6 y5 y4 */ \
- "movq %%mm7, 8+"#A2" \n\t" /* 7 ; save y7 y6 y5 y4 */ \
-
-
-// -----------------------------------------------------------------------------
-//
-// The first stage DCT 8x8 - forward DCTs of columns
-//
-// The %2puts are multiplied
-// for rows 0,4 - on cos_4_16,
-// for rows 1,7 - on cos_1_16,
-// for rows 2,6 - on cos_2_16,
-// for rows 3,5 - on cos_3_16
-// and are shifted to the left for rise of accuracy
-//
-// -----------------------------------------------------------------------------
-//
-// The 8-point scaled forward DCT algorithm (26a8m)
-//
-// -----------------------------------------------------------------------------
-//
-//#define DCT_8_FRW_COL(x, y)
-// {
-// short t0, t1, t2, t3, t4, t5, t6, t7;
-// short tp03, tm03, tp12, tm12, tp65, tm65;
-// short tp465, tm465, tp765, tm765;
-//
-// t0 = LEFT_SHIFT(x[0] + x[7]);
-// t1 = LEFT_SHIFT(x[1] + x[6]);
-// t2 = LEFT_SHIFT(x[2] + x[5]);
-// t3 = LEFT_SHIFT(x[3] + x[4]);
-// t4 = LEFT_SHIFT(x[3] - x[4]);
-// t5 = LEFT_SHIFT(x[2] - x[5]);
-// t6 = LEFT_SHIFT(x[1] - x[6]);
-// t7 = LEFT_SHIFT(x[0] - x[7]);
-//
-// tp03 = t0 + t3;
-// tm03 = t0 - t3;
-// tp12 = t1 + t2;
-// tm12 = t1 - t2;
-//
-// y[0] = tp03 + tp12;
-// y[4] = tp03 - tp12;
-//
-// y[2] = tm03 + tm12 * tg_2_16;
-// y[6] = tm03 * tg_2_16 - tm12;
-//
-// tp65 = (t6 + t5) * cos_4_16;
-// tm65 = (t6 - t5) * cos_4_16;
-//
-// tp765 = t7 + tp65;
-// tm765 = t7 - tp65;
-// tp465 = t4 + tm65;
-// tm465 = t4 - tm65;
-//
-// y[1] = tp765 + tp465 * tg_1_16;
-// y[7] = tp765 * tg_1_16 - tp465;
-// y[5] = tm765 * tg_3_16 + tm465;
-// y[3] = tm765 - tm465 * tg_3_16;
-// }
-//
-// -----------------------------------------------------------------------------
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_COL_4 INP,OUT
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_COL(A1, A2) \
- "movq 2*8(%3), %%mm0 \n\t" \
- "movq 16*3+"#A1", %%mm3 \n\t" \
- "movq %%mm0, %%mm1 \n\t" /* tg_3_16 */ \
- "movq 16*5+"#A1", %%mm5 \n\t" \
- "pmulhw %%mm3, %%mm0 \n\t" /* x3*(tg_3_16-1) */ \
- "movq (%3), %%mm4 \n\t" \
- "pmulhw %%mm5, %%mm1 \n\t" /* x5*(tg_3_16-1) */ \
- "movq 16*7+"#A1", %%mm7 \n\t" \
- "movq %%mm4, %%mm2 \n\t" /* tg_1_16 */ \
- "movq 16*1+"#A1", %%mm6 \n\t" \
- "pmulhw %%mm7, %%mm4 \n\t" /* x7*tg_1_16 */ \
- "paddsw %%mm3, %%mm0 \n\t" /* x3*tg_3_16 */ \
- "pmulhw %%mm6, %%mm2 \n\t" /* x1*tg_1_16 */ \
- "paddsw %%mm3, %%mm1 \n\t" /* x3+x5*(tg_3_16-1) */ \
- "psubsw %%mm5, %%mm0 \n\t" /* x3*tg_3_16-x5 = tm35 */ \
- "movq 3*8(%3), %%mm3 \n\t" \
- "paddsw %%mm5, %%mm1 \n\t" /* x3+x5*tg_3_16 = tp35 */ \
- "paddsw %%mm6, %%mm4 \n\t" /* x1+tg_1_16*x7 = tp17 */ \
- "psubsw %%mm7, %%mm2 \n\t" /* x1*tg_1_16-x7 = tm17 */ \
- "movq %%mm4, %%mm5 \n\t" /* tp17 */ \
- "movq %%mm2, %%mm6 \n\t" /* tm17 */ \
- "paddsw %%mm1, %%mm5 \n\t" /* tp17+tp35 = b0 */ \
- "psubsw %%mm0, %%mm6 \n\t" /* tm17-tm35 = b3 */ \
- "psubsw %%mm1, %%mm4 \n\t" /* tp17-tp35 = t1 */ \
- "paddsw %%mm0, %%mm2 \n\t" /* tm17+tm35 = t2 */ \
- "movq 1*8(%3), %%mm7 \n\t" \
- "movq %%mm4, %%mm1 \n\t" /* t1 */ \
- "movq %%mm5, 3*16+"#A2" \n\t" /* save b0 */ \
- "paddsw %%mm2, %%mm1 \n\t" /* t1+t2 */ \
- "movq %%mm6, 5*16+"#A2" \n\t" /* save b3 */ \
- "psubsw %%mm2, %%mm4 \n\t" /* t1-t2 */ \
- "movq 2*16+"#A1", %%mm5 \n\t" \
- "movq %%mm7, %%mm0 \n\t" /* tg_2_16 */ \
- "movq 6*16+"#A1", %%mm6 \n\t" \
- "pmulhw %%mm5, %%mm0 \n\t" /* x2*tg_2_16 */ \
- "pmulhw %%mm6, %%mm7 \n\t" /* x6*tg_2_16 */ \
- "pmulhw %%mm3, %%mm1 \n\t" /* ocos_4_16*(t1+t2) = b1/2 */ \
- "movq 0*16+"#A1", %%mm2 \n\t" \
- "pmulhw %%mm3, %%mm4 \n\t" /* ocos_4_16*(t1-t2) = b2/2 */ \
- "psubsw %%mm6, %%mm0 \n\t" /* t2*tg_2_16-x6 = tm26 */ \
- "movq %%mm2, %%mm3 \n\t" /* x0 */ \
- "movq 4*16+"#A1", %%mm6 \n\t" \
- "paddsw %%mm5, %%mm7 \n\t" /* x2+x6*tg_2_16 = tp26 */ \
- "paddsw %%mm6, %%mm2 \n\t" /* x0+x4 = tp04 */ \
- "psubsw %%mm6, %%mm3 \n\t" /* x0-x4 = tm04 */ \
- "movq %%mm2, %%mm5 \n\t" /* tp04 */ \
- "movq %%mm3, %%mm6 \n\t" /* tm04 */ \
- "psubsw %%mm7, %%mm2 \n\t" /* tp04-tp26 = a3 */ \
- "paddsw %%mm0, %%mm3 \n\t" /* tm04+tm26 = a1 */ \
- "paddsw %%mm1, %%mm1 \n\t" /* b1 */ \
- "paddsw %%mm4, %%mm4 \n\t" /* b2 */ \
- "paddsw %%mm7, %%mm5 \n\t" /* tp04+tp26 = a0 */ \
- "psubsw %%mm0, %%mm6 \n\t" /* tm04-tm26 = a2 */ \
- "movq %%mm3, %%mm7 \n\t" /* a1 */ \
- "movq %%mm6, %%mm0 \n\t" /* a2 */ \
- "paddsw %%mm1, %%mm3 \n\t" /* a1+b1 */ \
- "paddsw %%mm4, %%mm6 \n\t" /* a2+b2 */ \
- "psraw $6, %%mm3 \n\t" /* dst1 */ \
- "psubsw %%mm1, %%mm7 \n\t" /* a1-b1 */ \
- "psraw $6, %%mm6 \n\t" /* dst2 */ \
- "psubsw %%mm4, %%mm0 \n\t" /* a2-b2 */ \
- "movq 3*16+"#A2", %%mm1 \n\t" /* load b0 */ \
- "psraw $6, %%mm7 \n\t" /* dst6 */ \
- "movq %%mm5, %%mm4 \n\t" /* a0 */ \
- "psraw $6, %%mm0 \n\t" /* dst5 */ \
- "movq %%mm3, 1*16+"#A2" \n\t" \
- "paddsw %%mm1, %%mm5 \n\t" /* a0+b0 */ \
- "movq %%mm6, 2*16+"#A2" \n\t" \
- "psubsw %%mm1, %%mm4 \n\t" /* a0-b0 */ \
- "movq 5*16+"#A2", %%mm3 \n\t" /* load b3 */ \
- "psraw $6, %%mm5 \n\t" /* dst0 */ \
- "movq %%mm2, %%mm6 \n\t" /* a3 */ \
- "psraw $6, %%mm4 \n\t" /* dst7 */ \
- "movq %%mm0, 5*16+"#A2" \n\t" \
- "paddsw %%mm3, %%mm2 \n\t" /* a3+b3 */ \
- "movq %%mm7, 6*16+"#A2" \n\t" \
- "psubsw %%mm3, %%mm6 \n\t" /* a3-b3 */ \
- "movq %%mm5, 0*16+"#A2" \n\t" \
- "psraw $6, %%mm2 \n\t" /* dst3 */ \
- "movq %%mm4, 7*16+"#A2" \n\t" \
- "psraw $6, %%mm6 \n\t" /* dst4 */ \
- "movq %%mm2, 3*16+"#A2" \n\t" \
- "movq %%mm6, 4*16+"#A2" \n\t" \
-
-// =============================================================================
-// Code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// void idct_mmx(uint16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmx(short *block)
-{
- __asm__ volatile (
- // # Process each row
- DCT_8_INV_ROW_MMX(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
- DCT_8_INV_ROW_MMX(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
- DCT_8_INV_ROW_MMX(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
- DCT_8_INV_ROW_MMX(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
- DCT_8_INV_ROW_MMX(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
- DCT_8_INV_ROW_MMX(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
- DCT_8_INV_ROW_MMX(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
- DCT_8_INV_ROW_MMX(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
- // # Process the columns (4 at a time)
- DCT_8_INV_COL(0(%0), 0(%0))
- DCT_8_INV_COL(8(%0), 8(%0))
- :: "r" (block), "r" (rounder_0), "r" (tab_i_04_mmx), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmx_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
- ff_xvid_idct_mmx(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmx_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
- ff_xvid_idct_mmx(block);
- ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMX_INLINE */
-
-#if HAVE_MMXEXT_INLINE
-
-// -----------------------------------------------------------------------------
-// void idct_xmm(uint16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmxext(short *block)
-{
- __asm__ volatile (
- // # Process each row
- DCT_8_INV_ROW_XMM(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
- DCT_8_INV_ROW_XMM(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
- DCT_8_INV_ROW_XMM(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
- DCT_8_INV_ROW_XMM(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
- DCT_8_INV_ROW_XMM(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
- DCT_8_INV_ROW_XMM(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
- DCT_8_INV_ROW_XMM(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
- DCT_8_INV_ROW_XMM(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
- // # Process the columns (4 at a time)
- DCT_8_INV_COL(0(%0), 0(%0))
- DCT_8_INV_COL(8(%0), 8(%0))
- :: "r" (block), "r" (rounder_0), "r" (tab_i_04_xmm), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
- ff_xvid_idct_mmxext(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
- ff_xvid_idct_mmxext(block);
- ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMXEXT_INLINE */
diff --git a/libavcodec/x86/xvididct_sse2.c b/libavcodec/x86/xvididct_sse2.c
deleted file mode 100644
index f318e95999..0000000000
--- a/libavcodec/x86/xvididct_sse2.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - SSE2 inverse discrete cosine transform -
- *
- * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
- *
- * Conversion to gcc syntax with modifications
- * by Alexander Strange <astrange@ithinksw.com>
- *
- * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
- *
- * This file is part of Libav.
- *
- * Vertical pass is an implementation of the scheme:
- * Loeffler C., Ligtenberg A., and Moschytz C.S.:
- * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
- * Proc. ICASSP 1989, 988-991.
- *
- * Horizontal pass is a double 4x4 vector/matrix multiplication,
- * (see also Intel's Application Note 922:
- * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- * Copyright (C) 1999 Intel Corporation)
- *
- * More details at http://skal.planet-d.net/coding/dct.html
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/internal.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_SSE2_INLINE
-
-/**
- * @file
- * @brief SSE2 IDCT compatible with the Xvid IDCT
- */
-
-#define X8(x) x, x, x, x, x, x, x, x
-
-DECLARE_ASM_CONST(16, int16_t, tan1)[] = { X8(13036) }; // tan( pi/16)
-DECLARE_ASM_CONST(16, int16_t, tan2)[] = { X8(27146) }; // tan(2pi/16) = sqrt(2)-1
-DECLARE_ASM_CONST(16, int16_t, tan3)[] = { X8(43790) }; // tan(3pi/16)-1
-DECLARE_ASM_CONST(16, int16_t, sqrt2)[] = { X8(23170) }; // 0.5/sqrt(2)
-DECLARE_ASM_CONST(8, uint8_t, m127)[] = { X8(127) };
-
-DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
- 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
- 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
- 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
- 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
- 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
- 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
- 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
- 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
- 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
- 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
- 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
- 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
- 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
- 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
- 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
- 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
-};
-
-DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
- 65536, 65536, 65536, 65536,
- 3597, 3597, 3597, 3597,
- 2260, 2260, 2260, 2260,
- 1203, 1203, 1203, 1203,
- 120, 120, 120, 120,
- 512, 512, 512, 512
-};
-
-// Temporary storage before the column pass
-#define ROW1 "%%xmm6"
-#define ROW3 "%%xmm4"
-#define ROW5 "%%xmm5"
-#define ROW7 "%%xmm7"
-
-#define CLEAR_ODD(r) "pxor "r","r" \n\t"
-#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
-
-#if ARCH_X86_64
-
-# define ROW0 "%%xmm8"
-# define REG0 ROW0
-# define ROW2 "%%xmm9"
-# define REG2 ROW2
-# define ROW4 "%%xmm10"
-# define REG4 ROW4
-# define ROW6 "%%xmm11"
-# define REG6 ROW6
-# define CLEAR_EVEN(r) CLEAR_ODD(r)
-# define PUT_EVEN(dst) PUT_ODD(dst)
-# define XMMS "%%xmm12"
-# define MOV_32_ONLY "#"
-# define SREG2 REG2
-# define TAN3 "%%xmm13"
-# define TAN1 "%%xmm14"
-
-#else
-
-# define ROW0 "(%0)"
-# define REG0 "%%xmm4"
-# define ROW2 "2*16(%0)"
-# define REG2 "%%xmm4"
-# define ROW4 "4*16(%0)"
-# define REG4 "%%xmm6"
-# define ROW6 "6*16(%0)"
-# define REG6 "%%xmm6"
-# define CLEAR_EVEN(r)
-# define PUT_EVEN(dst) \
- "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
- "movdqa %%xmm2, "dst" \n\t"
-# define XMMS "%%xmm2"
-# define MOV_32_ONLY "movdqa "
-# define SREG2 "%%xmm7"
-# define TAN3 "%%xmm0"
-# define TAN1 "%%xmm2"
-
-#endif
-
-#define ROUND(x) "paddd "MANGLE(x)
-
-#define JZ(reg, to) \
- "testl "reg","reg" \n\t" \
- "jz "to" \n\t"
-
-#define JNZ(reg, to) \
- "testl "reg","reg" \n\t" \
- "jnz "to" \n\t"
-
-#define TEST_ONE_ROW(src, reg, clear) \
- clear \
- "movq "src", %%mm1 \n\t" \
- "por 8+"src", %%mm1 \n\t" \
- "paddusb %%mm0, %%mm1 \n\t" \
- "pmovmskb %%mm1, "reg" \n\t"
-
-#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
- clear1 \
- clear2 \
- "movq "row1", %%mm1 \n\t" \
- "por 8+"row1", %%mm1 \n\t" \
- "movq "row2", %%mm2 \n\t" \
- "por 8+"row2", %%mm2 \n\t" \
- "paddusb %%mm0, %%mm1 \n\t" \
- "paddusb %%mm0, %%mm2 \n\t" \
- "pmovmskb %%mm1, "reg1" \n\t" \
- "pmovmskb %%mm2, "reg2" \n\t"
-
-/// IDCT pass on rows.
-#define iMTX_MULT(src, table, rounder, put) \
- "movdqa "src", %%xmm3 \n\t" \
- "movdqa %%xmm3, %%xmm0 \n\t" \
- "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
- "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
- "pmaddwd "table", %%xmm0 \n\t" \
- "pmaddwd 16+"table", %%xmm1 \n\t" \
- "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
- "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
- "pmaddwd 32+"table", %%xmm2 \n\t" \
- "pmaddwd 48+"table", %%xmm3 \n\t" \
- "paddd %%xmm1, %%xmm0 \n\t" \
- "paddd %%xmm3, %%xmm2 \n\t" \
- rounder", %%xmm0 \n\t" \
- "movdqa %%xmm2, %%xmm3 \n\t" \
- "paddd %%xmm0, %%xmm2 \n\t" \
- "psubd %%xmm3, %%xmm0 \n\t" \
- "psrad $11, %%xmm2 \n\t" \
- "psrad $11, %%xmm0 \n\t" \
- "packssdw %%xmm0, %%xmm2 \n\t" \
- put \
- "1: \n\t"
-
-#define iLLM_HEAD \
- "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
- "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
-
-/// IDCT pass on columns.
-#define iLLM_PASS(dct) \
- "movdqa "TAN3", %%xmm1 \n\t" \
- "movdqa "TAN1", %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "pmulhw %%xmm5, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN3" \n\t" \
- "paddsw %%xmm5, %%xmm1 \n\t" \
- "psubsw %%xmm5, "TAN3" \n\t" \
- "paddsw %%xmm4, %%xmm1 \n\t" \
- "pmulhw %%xmm7, %%xmm3 \n\t" \
- "pmulhw %%xmm6, "TAN1" \n\t" \
- "paddsw %%xmm6, %%xmm3 \n\t" \
- "psubsw %%xmm7, "TAN1" \n\t" \
- "movdqa %%xmm3, %%xmm7 \n\t" \
- "movdqa "TAN1", %%xmm6 \n\t" \
- "psubsw %%xmm1, %%xmm3 \n\t" \
- "psubsw "TAN3", "TAN1" \n\t" \
- "paddsw %%xmm7, %%xmm1 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa %%xmm3, %%xmm6 \n\t" \
- "psubsw "TAN3", %%xmm3 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
- "pmulhw %%xmm4, %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw "TAN3", "TAN3" \n\t" \
- "paddsw %%xmm3, %%xmm3 \n\t" \
- "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
- MOV_32_ONLY ROW2", "REG2" \n\t" \
- MOV_32_ONLY ROW6", "REG6" \n\t" \
- "movdqa %%xmm7, %%xmm5 \n\t" \
- "pmulhw "REG6", %%xmm7 \n\t" \
- "pmulhw "REG2", %%xmm5 \n\t" \
- "paddsw "REG2", %%xmm7 \n\t" \
- "psubsw "REG6", %%xmm5 \n\t" \
- MOV_32_ONLY ROW0", "REG0" \n\t" \
- MOV_32_ONLY ROW4", "REG4" \n\t" \
- MOV_32_ONLY" "TAN1", (%0) \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw "REG4", "REG0" \n\t" \
- "paddsw "XMMS", "REG4" \n\t" \
- "movdqa "REG4", "XMMS" \n\t" \
- "psubsw %%xmm7, "REG4" \n\t" \
- "paddsw "XMMS", %%xmm7 \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm5, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm5 \n\t" \
- "movdqa %%xmm5, "XMMS" \n\t" \
- "psubsw "TAN3", %%xmm5 \n\t" \
- "paddsw "XMMS", "TAN3" \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm3, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm3 \n\t" \
- MOV_32_ONLY" (%0), "TAN1" \n\t" \
- "psraw $6, %%xmm5 \n\t" \
- "psraw $6, "REG0" \n\t" \
- "psraw $6, "TAN3" \n\t" \
- "psraw $6, %%xmm3 \n\t" \
- "movdqa "TAN3", 1*16("dct") \n\t" \
- "movdqa %%xmm3, 2*16("dct") \n\t" \
- "movdqa "REG0", 5*16("dct") \n\t" \
- "movdqa %%xmm5, 6*16("dct") \n\t" \
- "movdqa %%xmm7, %%xmm0 \n\t" \
- "movdqa "REG4", %%xmm4 \n\t" \
- "psubsw %%xmm1, %%xmm7 \n\t" \
- "psubsw "TAN1", "REG4" \n\t" \
- "paddsw %%xmm0, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN1" \n\t" \
- "psraw $6, %%xmm1 \n\t" \
- "psraw $6, %%xmm7 \n\t" \
- "psraw $6, "TAN1" \n\t" \
- "psraw $6, "REG4" \n\t" \
- "movdqa %%xmm1, ("dct") \n\t" \
- "movdqa "TAN1", 3*16("dct") \n\t" \
- "movdqa "REG4", 4*16("dct") \n\t" \
- "movdqa %%xmm7, 7*16("dct") \n\t"
-
-/// IDCT pass on columns, assuming rows 4-7 are zero.
-#define iLLM_PASS_SPARSE(dct) \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw %%xmm4, "TAN3" \n\t" \
- "movdqa %%xmm6, %%xmm3 \n\t" \
- "pmulhw %%xmm6, "TAN1" \n\t" \
- "movdqa %%xmm4, %%xmm1 \n\t" \
- "psubsw %%xmm1, %%xmm3 \n\t" \
- "paddsw %%xmm6, %%xmm1 \n\t" \
- "movdqa "TAN1", %%xmm6 \n\t" \
- "psubsw "TAN3", "TAN1" \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa %%xmm3, %%xmm6 \n\t" \
- "psubsw "TAN3", %%xmm3 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
- "pmulhw %%xmm4, %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw "TAN3", "TAN3" \n\t" \
- "paddsw %%xmm3, %%xmm3 \n\t" \
- "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
- MOV_32_ONLY ROW2", "SREG2" \n\t" \
- "pmulhw "SREG2", %%xmm5 \n\t" \
- MOV_32_ONLY ROW0", "REG0" \n\t" \
- "movdqa "REG0", %%xmm6 \n\t" \
- "psubsw "SREG2", %%xmm6 \n\t" \
- "paddsw "REG0", "SREG2" \n\t" \
- MOV_32_ONLY" "TAN1", (%0) \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm5, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm5 \n\t" \
- "movdqa %%xmm5, "XMMS" \n\t" \
- "psubsw "TAN3", %%xmm5 \n\t" \
- "paddsw "XMMS", "TAN3" \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm3, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm3 \n\t" \
- MOV_32_ONLY" (%0), "TAN1" \n\t" \
- "psraw $6, %%xmm5 \n\t" \
- "psraw $6, "REG0" \n\t" \
- "psraw $6, "TAN3" \n\t" \
- "psraw $6, %%xmm3 \n\t" \
- "movdqa "TAN3", 1*16("dct") \n\t" \
- "movdqa %%xmm3, 2*16("dct") \n\t" \
- "movdqa "REG0", 5*16("dct") \n\t" \
- "movdqa %%xmm5, 6*16("dct") \n\t" \
- "movdqa "SREG2", %%xmm0 \n\t" \
- "movdqa %%xmm6, %%xmm4 \n\t" \
- "psubsw %%xmm1, "SREG2" \n\t" \
- "psubsw "TAN1", %%xmm6 \n\t" \
- "paddsw %%xmm0, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN1" \n\t" \
- "psraw $6, %%xmm1 \n\t" \
- "psraw $6, "SREG2" \n\t" \
- "psraw $6, "TAN1" \n\t" \
- "psraw $6, %%xmm6 \n\t" \
- "movdqa %%xmm1, ("dct") \n\t" \
- "movdqa "TAN1", 3*16("dct") \n\t" \
- "movdqa %%xmm6, 4*16("dct") \n\t" \
- "movdqa "SREG2", 7*16("dct") \n\t"
-
-inline void ff_xvid_idct_sse2(short *block)
-{
- __asm__ volatile (
- "movq "MANGLE (m127) ", %%mm0 \n\t"
- iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
- iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 1 * 16), PUT_ODD(ROW1))
- iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 2 * 16), PUT_EVEN(ROW2))
-
- TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
- JZ("%%eax", "1f")
- iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 3 * 16), PUT_ODD(ROW3))
-
- TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
- TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
- iLLM_HEAD
- ".p2align 4 \n\t"
- JNZ("%%ecx", "2f")
- JNZ("%%eax", "3f")
- JNZ("%%edx", "4f")
- JNZ("%%esi", "5f")
- iLLM_PASS_SPARSE("%0")
- "jmp 6f \n\t"
- "2: \n\t"
- iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
- "3: \n\t"
- iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 4 * 16), PUT_ODD(ROW5))
- JZ("%%edx", "1f")
- "4: \n\t"
- iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 5 * 16), PUT_EVEN(ROW6))
- JZ("%%esi", "1f")
- "5: \n\t"
- iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 5 * 16), PUT_ODD(ROW7))
-#if ARCH_X86_32
- iLLM_HEAD
-#endif
- iLLM_PASS("%0")
- "6: \n\t"
- : "+r" (block)
- :
- : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7", )
-#if ARCH_X86_64
- XMM_CLOBBERS("%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", )
-#endif
- "%eax", "%ecx", "%edx", "%esi", "memory");
-}
-
-void ff_xvid_idct_sse2_put(uint8_t *dest, ptrdiff_t line_size, short *block)
-{
- ff_xvid_idct_sse2(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_sse2_add(uint8_t *dest, ptrdiff_t line_size, short *block)
-{
- ff_xvid_idct_sse2(block);
- ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_SSE2_INLINE */