72 files changed, 2200 insertions, 403 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index eb82ef572e..3e998efafc 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -16,18 +16,22 @@ MMX-OBJS-$(CONFIG_AAC_DECODER)         += x86/sbrdsp_init.o
 MMX-OBJS-$(CONFIG_AC3DSP)              += x86/ac3dsp_mmx.o
 MMX-OBJS-$(CONFIG_CAVS_DECODER)        += x86/cavsdsp_mmx.o
 MMX-OBJS-$(CONFIG_DNXHD_ENCODER)       += x86/dnxhd_mmx.o
-MMX-OBJS-$(CONFIG_DWT)                 += x86/snowdsp_mmx.o
+MMX-OBJS-$(CONFIG_DWT)                 += x86/snowdsp_mmx.o \
+                                          x86/dwt.o
 MMX-OBJS-$(CONFIG_ENCODERS)            += x86/dsputilenc_mmx.o
 MMX-OBJS-$(CONFIG_FFT)                 += x86/fft.o
+MMX-OBJS-$(CONFIG_GPL)                 += x86/idct_mmx.o
 MMX-OBJS-$(CONFIG_H264DSP)             += x86/h264dsp_mmx.o
 MMX-OBJS-$(CONFIG_H264PRED)            += x86/h264_intrapred_init.o
 MMX-OBJS-$(CONFIG_LPC)                 += x86/lpc_mmx.o
 MMX-OBJS-$(CONFIG_MPEGAUDIODSP)        += x86/mpegaudiodec_mmx.o
 MMX-OBJS-$(CONFIG_PNG_DECODER)         += x86/pngdsp-init.o
 MMX-OBJS-$(CONFIG_PRORES_DECODER)      += x86/proresdsp-init.o
+MMX-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp-init.o
 MMX-OBJS-$(CONFIG_RV30_DECODER)        += x86/rv34dsp_init.o
 MMX-OBJS-$(CONFIG_RV40_DECODER)        += x86/rv34dsp_init.o            \
                                           x86/rv40dsp_init.o
+MMX-OBJS-$(CONFIG_V210_DECODER)        += x86/v210-init.o
 MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o
 MMX-OBJS-$(CONFIG_VP5_DECODER)         += x86/vp56dsp_init.o
 MMX-OBJS-$(CONFIG_VP6_DECODER)         += x86/vp56dsp_init.o
@@ -36,9 +40,12 @@ MMX-OBJS-$(CONFIG_VP8_DECODER)         += x86/vp8dsp-init.o
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32_sse.o
+YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp_mmx.o x86/diracdsp_yasm.o
 YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc_yasm.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft_mmx.o                 \
                                           $(YASM-OBJS-FFT-yes)
+
+YASM-OBJS-$(CONFIG_DWT)                += x86/dwt_yasm.o
 YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o           \
                                           x86/h264_chromamc_10bit.o
 YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o            \
@@ -53,9 +60,11 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_10bit.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36_sse.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
+YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV30_DECODER)       += x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o                 \
                                           x86/rv40dsp.o
+YASM-OBJS-$(CONFIG_V210_DECODER)       += x86/v210.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp_yasm.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp56dsp.o
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index ef828bb0d5..acbab35cae 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -2,25 +2,25 @@
 ;* x86-optimized AC-3 DSP utils
 ;* Copyright (c) 2011 Justin Ruggles
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
 
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 0ac8685c6d..5549f3e550 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -2,20 +2,20 @@
  * x86-optimized AC-3 DSP utils
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 02dbc54db7..3fcd8abbc9 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -73,10 +73,7 @@
         "test   "lowword"   , "lowword"                                 \n\t"\
         "jnz    2f                                                      \n\t"\
         "mov    "byte"      , %%"REG_c"                                 \n\t"\
-        "cmp    "end"       , %%"REG_c"                                 \n\t"\
-        "jge    1f                                                      \n\t"\
         "add"OPSIZE" $2     , "byte"                                    \n\t"\
-        "1:                                                             \n\t"\
         "movzwl (%%"REG_c") , "tmp"                                     \n\t"\
         "lea    -1("low")   , %%ecx                                     \n\t"\
         "xor    "low"       , %%ecx                                     \n\t"\
@@ -93,6 +90,7 @@
 
 #else /* BROKEN_RELOCATIONS */
 #define TABLES_ARG
+#define RIP_ARG
 
 #if HAVE_FAST_CMOV
 #define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
@@ -134,10 +132,7 @@
         "test   "lowword"   , "lowword"                                 \n\t"\
         " jnz   2f                                                      \n\t"\
         "mov    "byte"      , %%"REG_c"                                 \n\t"\
-        "cmp    "end"       , %%"REG_c"                                 \n\t"\
-        "jge    1f                                                      \n\t"\
         "add"OPSIZE" $2     , "byte"                                    \n\t"\
-        "1:                                                             \n\t"\
         "movzwl (%%"REG_c")     , "tmp"                                 \n\t"\
         "lea    -1("low")   , %%ecx                                     \n\t"\
         "xor    "low"       , %%ecx                                     \n\t"\
@@ -155,7 +150,8 @@
 #endif /* BROKEN_RELOCATIONS */
 
 
-#if HAVE_7REGS
+#if HAVE_7REGS && !(defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+               && !(defined(__i386) && !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
 #define get_cabac_inline get_cabac_inline_x86
 static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                                                  uint8_t *const state)
@@ -209,10 +205,9 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "movzwl         (%1), %%edx     \n\t"
         "bswap         %%edx            \n\t"
         "shrl            $15, %%edx     \n\t"
+        "add              $2, %1        \n\t"
         "addl          %%edx, %%eax     \n\t"
-        "cmp         %a5(%2), %1        \n\t"
-        "jge              1f            \n\t"
-        "add"OPSIZE"      $2, %a4(%2)   \n\t"
+        "mov              %1, %a4(%2)   \n\t"
         "1:                             \n\t"
         "movl          %%eax, %a3(%2)   \n\t"
 
diff --git a/libavcodec/x86/cavsdsp_mmx.c b/libavcodec/x86/cavsdsp_mmx.c
index 4ed7d8e598..05f192fc71 100644
--- a/libavcodec/x86/cavsdsp_mmx.c
+++ b/libavcodec/x86/cavsdsp_mmx.c
@@ -5,20 +5,20 @@
  * MMX-optimized DSP functions, based on H.264 optimizations by
  * Michael Niedermayer and Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index 9d6169ca66..02b5f3fc89 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -2,25 +2,25 @@
 ;* 32 point SSE-optimized DCT transform
 ;* Copyright (c) 2010 Vitor Sessak
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA 32
 
diff --git a/libavcodec/x86/deinterlace.asm b/libavcodec/x86/deinterlace.asm
index 8613485d5d..a09473bdae 100644
--- a/libavcodec/x86/deinterlace.asm
+++ b/libavcodec/x86/deinterlace.asm
@@ -3,25 +3,25 @@
 ;* Copyright (c) 2010 Vitor Sessak
 ;* Copyright (c) 2002 Michael Niedermayer
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
 
diff --git a/libavcodec/x86/diracdsp_mmx.c b/libavcodec/x86/diracdsp_mmx.c
new file mode 100644
index 0000000000..693a9af4f8
--- /dev/null
+++ b/libavcodec/x86/diracdsp_mmx.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_mmx.h"
+#include "diracdsp_mmx.h"
+
+void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+
+#define HPEL_FILTER(MMSIZE, EXT)                                                             \
+    void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int);               \
+    void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int);                    \
+                                                                                             \
+    static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,       \
+                                          const uint8_t *src, int stride, int width, int height)   \
+    {                                                                                        \
+        while( height-- )                                                                    \
+        {                                                                                    \
+            ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
+            ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width);                                \
+            ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width);                               \
+                                                                                             \
+            dsth += stride;                                                                  \
+            dstv += stride;                                                                  \
+            dstc += stride;                                                                  \
+            src  += stride;                                                                  \
+        }                                                                                    \
+    }
+
+#if !ARCH_X86_64
+HPEL_FILTER(8, mmx)
+#endif
+HPEL_FILTER(16, sse2)
+
+#define PIXFUNC(PFX, IDX, EXT)                                                   \
+    /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/  \
+    c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
+    c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
+
+void ff_diracdsp_init_mmx(DiracDSPContext* c)
+{
+    int mm_flags = av_get_cpu_flags();
+
+#if HAVE_YASM
+    c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
+#if !ARCH_X86_64
+    c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
+    c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
+    c->dirac_hpel_filter = dirac_hpel_filter_mmx;
+    c->add_rect_clamped = ff_add_rect_clamped_mmx;
+    c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx;
+#endif
+#endif
+
+    PIXFUNC(put, 0, mmx);
+    PIXFUNC(avg, 0, mmx);
+
+    if (mm_flags & AV_CPU_FLAG_MMX2) {
+        PIXFUNC(avg, 0, mmx2);
+    }
+
+    if (mm_flags & AV_CPU_FLAG_SSE2) {
+#if HAVE_YASM
+        c->dirac_hpel_filter = dirac_hpel_filter_sse2;
+        c->add_rect_clamped = ff_add_rect_clamped_sse2;
+        c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2;
+
+        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
+        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
+#endif
+        c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
+        c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
+        c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
+        c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
+    }
+}
diff --git a/libavcodec/x86/diracdsp_mmx.h b/libavcodec/x86/diracdsp_mmx.h
new file mode 100644
index 0000000000..3d8e1173fa
--- /dev/null
+++ b/libavcodec/x86/diracdsp_mmx.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DIRACDSP_H
+#define AVCODEC_X86_DIRACDSP_H
+
+#include "libavcodec/diracdsp.h"
+
+void ff_diracdsp_init_mmx(DiracDSPContext* c);
+
+DECL_DIRAC_PIXOP(put, mmx);
+DECL_DIRAC_PIXOP(avg, mmx);
+DECL_DIRAC_PIXOP(avg, mmx2);
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+
+void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+#endif
diff --git a/libavcodec/x86/diracdsp_yasm.asm b/libavcodec/x86/diracdsp_yasm.asm
new file mode 100644
index 0000000000..049bf16dbf
--- /dev/null
+++ b/libavcodec/x86/diracdsp_yasm.asm
@@ -0,0 +1,260 @@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+pw_3: times 8 dw 3
+pw_7: times 8 dw 7
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+pb_128: times 16 db 128
+
+section .text
+
+%macro UNPACK_ADD 6
+    mov%5   %1, %3
+    mov%6   m5, %4
+    mova    m4, %1
+    mova    %2, m5
+    punpcklbw %1, m7
+    punpcklbw m5, m7
+    punpckhbw m4, m7
+    punpckhbw %2, m7
+    paddw   %1, m5
+    paddw   %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+    mov     src0q, srcq
+    lea     stridex3q, [3*strideq]
+    sub     src0q, stridex3q
+    pxor    m7, m7
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+    psubw   m0, m2
+    psubw   m1, m3
+
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq], m0
+    add     dstq, mmsize
+    add     srcq, mmsize
+    add     src0q, mmsize
+    sub     widthd, mmsize
+    jg      .loop
+    RET
+
+; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+    dec     widthd
+    pxor    m7, m7
+    and     widthd, ~(mmsize-1)
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+    psubw   m0, m2
+    psubw   m1, m3
+
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq + widthq], m0
+    sub     widthd, mmsize
+    jge     .loop
+    RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,7,3, dst, dst_stride, src, src_stride, w, dst2, src2
+    mova    m0, [pb_128]
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+    mov   r7d, r5m
+    mov   r8d, wd
+    %define wspill r8d
+    %define hd r7d
+%else
+    mov    r4m, wd
+    %define wspill r4m
+    %define hd r5mp
+%endif
+
+.loopy
+    lea     src2q, [srcq+src_strideq*2]
+    lea     dst2q, [dstq+dst_strideq]
+.loopx:
+    sub      wd, mmsize
+    mova     m1, [srcq +2*wq]
+    mova     m2, [src2q+2*wq]
+    packsswb m1, [srcq +2*wq+mmsize]
+    packsswb m2, [src2q+2*wq+mmsize]
+    paddb    m1, m0
+    paddb    m2, m0
+    mova    [dstq +wq], m1
+    mova    [dst2q+wq], m2
+    jg      .loopx
+
+    lea   srcq, [srcq+src_strideq*4]
+    lea   dstq, [dstq+dst_strideq*2]
+    sub     hd, 2
+    mov     wd, wspill
+    jg      .loopy
+    RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,7,3, dst, src, stride, idwt, idwt_stride, w, h
+    mova    m0, [pw_32]
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+    mov   r8d, wd
+    %define wspill r8d
+%else
+    mov    r5m, wd
+    %define wspill r5m
+%endif
+
+.loop:
+    sub     wd, mmsize
+    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
+    paddw   m1, m0
+    psraw   m1, 6
+    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+    paddw   m2, m0
+    psraw   m2, 6
+    paddw   m1, [idwtq+2*wq]
+    paddw   m2, [idwtq+2*wq+mmsize]
+    packuswb m1, m2
+    mova    [dstq +wq], m1
+    jg      .loop
+
+    lea   srcq, [srcq + 2*strideq]
+    add   dstq, strideq
+    lea  idwtq, [idwtq+ 2*idwt_strideq]
+    sub     hd, 1
+    mov     wd, wspill
+    jg      .loop
+    RET
+%endm
+
+%macro ADD_OBMC 2
+; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+    pxor        m4, m4
+.loop:
+%assign i 0
+%rep %1 / mmsize
+    mova        m0, [srcq+i]
+    mova        m1, m0
+    punpcklbw   m0, m4
+    punpckhbw   m1, m4
+    mova        m2, [obmcq+i]
+    mova        m3, m2
+   punpcklbw   m2, m4
+    punpckhbw   m3, m4
+    pmullw      m0, m2
+    pmullw      m1, m3
+    movu        m2, [dstq+2*i]
+    movu        m3, [dstq+2*i+mmsize]
+    paddw       m0, m2
+    paddw       m1, m3
+    movu        [dstq+2*i], m0
+    movu        [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+    lea         srcq, [srcq+strideq]
+    lea         dstq, [dstq+2*strideq]
+    add         obmcq, 32
+    sub         yblend, 1
+    jg          .loop
+    RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
diff --git a/libavcodec/x86/dnxhd_mmx.c b/libavcodec/x86/dnxhd_mmx.c
index 54293aa280..3c1e86955f 100644
--- a/libavcodec/x86/dnxhd_mmx.c
+++ b/libavcodec/x86/dnxhd_mmx.c
@@ -4,20 +4,20 @@
  *
  * VC-3 encoder funded by the British Broadcasting Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 3daa09b314..f4ed7565e7 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -3,23 +3,23 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  */
 
 #include "libavutil/cpu.h"
@@ -31,6 +31,7 @@
 #include "libavcodec/ac3dec.h"
 #include "dsputil_mmx.h"
 #include "idct_xvid.h"
+#include "diracdsp_mmx.h"
 
 //#undef NDEBUG
 //#include <assert.h>
@@ -836,7 +837,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
             : "+r"(ptr)
             : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
             );
-    } else {
+    } else if(w==16){
         __asm__ volatile (
             "1:                                 \n\t"
             "movd            (%0), %%mm0        \n\t"
@@ -857,6 +858,25 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
             : "+r"(ptr)
             : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
             );
+    } else {
+        av_assert1(w == 4);
+        __asm__ volatile (
+            "1:                             \n\t"
+            "movd            (%0), %%mm0    \n\t"
+            "punpcklbw      %%mm0, %%mm0    \n\t"
+            "punpcklwd      %%mm0, %%mm0    \n\t"
+            "movd           %%mm0, -4(%0)   \n\t"
+            "movd      -4(%0, %2), %%mm1    \n\t"
+            "punpcklbw      %%mm1, %%mm1    \n\t"
+            "punpckhwd      %%mm1, %%mm1    \n\t"
+            "punpckhdq      %%mm1, %%mm1    \n\t"
+            "movd           %%mm1, (%0, %2) \n\t"
+            "add               %1, %0       \n\t"
+            "cmp               %3, %0       \n\t"
+            "jb                1b           \n\t"
+            : "+r"(ptr)
+            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+            );
     }
 
     /* top and bottom (and hopefully also the corners) */
@@ -2162,8 +2182,115 @@ void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
     avg_pixels8_mmx2(dst, src, stride, 8);
 }
 
+/* only used in VP3/5/6 */
+static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
+{
+//    START_TIMER
+    MOVQ_BFE(mm6);
+    __asm__ volatile(
+        "1:                             \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   (%2), %%mm1             \n\t"
+        "movq   (%1,%4), %%mm2          \n\t"
+        "movq   (%2,%4), %%mm3          \n\t"
+        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%3)             \n\t"
+        "movq   %%mm5, (%3,%4)          \n\t"
+
+        "movq   (%1,%4,2), %%mm0        \n\t"
+        "movq   (%2,%4,2), %%mm1        \n\t"
+        "movq   (%1,%5), %%mm2          \n\t"
+        "movq   (%2,%5), %%mm3          \n\t"
+        "lea    (%1,%4,4), %1           \n\t"
+        "lea    (%2,%4,4), %2           \n\t"
+        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%3,%4,2)        \n\t"
+        "movq   %%mm5, (%3,%5)          \n\t"
+        "lea    (%3,%4,4), %3           \n\t"
+        "subl   $4, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
+        :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
+        :"memory");
+//    STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
+}
+static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
+{
+    put_vp_no_rnd_pixels8_l2_mmx(dst, a, b, stride, h);
+    put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
+}
+
+#if CONFIG_DIRAC_DECODER
+#define DIRAC_PIXOP(OPNAME, EXT)\
+void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
+    OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+}
+
+DIRAC_PIXOP(put, mmx)
+DIRAC_PIXOP(avg, mmx)
+DIRAC_PIXOP(avg, mmx2)
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    put_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    avg_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    put_pixels16_sse2(dst   , src[0]   , stride, h);
+    put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    avg_pixels16_sse2(dst   , src[0]   , stride, h);
+    avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+}
+#endif
+
 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
  * converted. */
+#if CONFIG_GPL
+static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
+                                    DCTELEM *block)
+{
+    ff_mmx_idct(block);
+    ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
+                                    DCTELEM *block)
+{
+    ff_mmx_idct(block);
+    ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+
+static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
+                                     DCTELEM *block)
+{
+    ff_mmxext_idct(block);
+    ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
+                                     DCTELEM *block)
+{
+    ff_mmxext_idct(block);
+    ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+#endif
 
 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
 {
@@ -2597,6 +2724,9 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
 
     c->add_bytes = add_bytes_mmx;
 
+    c->put_no_rnd_pixels_l2[0]= put_vp_no_rnd_pixels16_l2_mmx;
+    c->put_no_rnd_pixels_l2[1]= put_vp_no_rnd_pixels8_l2_mmx;
+
     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
         c->h263_v_loop_filter = h263_v_loop_filter_mmx;
         c->h263_h_loop_filter = h263_h_loop_filter_mmx;
@@ -3014,12 +3144,25 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
 #if HAVE_INLINE_ASM
         const int idct_algo = avctx->idct_algo;
 
-        if (avctx->bits_per_raw_sample <= 8) {
+        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
             if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                 c->idct_put              = ff_simple_idct_put_mmx;
                 c->idct_add              = ff_simple_idct_add_mmx;
                 c->idct                  = ff_simple_idct_mmx;
                 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
+#if CONFIG_GPL
+            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
+                if (mm_flags & AV_CPU_FLAG_MMX2) {
+                    c->idct_put = ff_libmpeg2mmx2_idct_put;
+                    c->idct_add = ff_libmpeg2mmx2_idct_add;
+                    c->idct     = ff_mmxext_idct;
+                } else {
+                    c->idct_put = ff_libmpeg2mmx_idct_put;
+                    c->idct_add = ff_libmpeg2mmx_idct_add;
+                    c->idct     = ff_mmx_idct;
+                }
+                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
+#endif
             } else if (idct_algo == FF_IDCT_CAVS) {
                     c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
             } else if (idct_algo == FF_IDCT_XVIDMMX) {
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index f1db78d5c4..d0b0344917 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -2,20 +2,20 @@
  * MMX optimized DSP utils
  * Copyright (c) 2007  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dsputil_mmx_avg_template.c b/libavcodec/x86/dsputil_mmx_avg_template.c
index 8b116b74e2..6f768595c0 100644
--- a/libavcodec/x86/dsputil_mmx_avg_template.c
+++ b/libavcodec/x86/dsputil_mmx_avg_template.c
@@ -7,20 +7,20 @@
  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  * and improved by Zdenek Kabelac <kabi@users.sf.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dsputil_mmx_qns_template.c b/libavcodec/x86/dsputil_mmx_qns_template.c
index 20a40a175e..77a41b9dcb 100644
--- a/libavcodec/x86/dsputil_mmx_qns_template.c
+++ b/libavcodec/x86/dsputil_mmx_qns_template.c
@@ -5,20 +5,20 @@
  * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
  * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dsputil_mmx_rnd_template.c b/libavcodec/x86/dsputil_mmx_rnd_template.c
index 34a2c0bca8..e4c91381fa 100644
--- a/libavcodec/x86/dsputil_mmx_rnd_template.c
+++ b/libavcodec/x86/dsputil_mmx_rnd_template.c
@@ -7,20 +7,20 @@
  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  * and improved by Zdenek Kabelac <kabi@users.sf.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index af2de15a25..06d2027c69 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -2,24 +2,24 @@
 ;* MMX optimized DSP utils
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
+%include "libavutil/x86/x86inc.asm"
 %include "x86util.asm"
 
 SECTION_RODATA
@@ -1371,3 +1371,4 @@ cglobal bswap32_buf, 3,4,3
     mov      [r0], r2d
 .end:
     RET
+
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 0ac4d2c10d..77b7b7620e 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
@@ -821,8 +821,9 @@ static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, i
 }
 #undef SUM
 
-static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
+static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
     x86_reg i=0;
+    if(w>=16)
     __asm__ volatile(
         "1:                             \n\t"
         "movq  (%2, %0), %%mm0          \n\t"
diff --git a/libavcodec/x86/dsputilenc_yasm.asm b/libavcodec/x86/dsputilenc_yasm.asm
index cfd4e6d28b..1be359d667 100644
--- a/libavcodec/x86/dsputilenc_yasm.asm
+++ b/libavcodec/x86/dsputilenc_yasm.asm
@@ -4,25 +4,25 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION .text
 
diff --git a/libavcodec/x86/dwt.c b/libavcodec/x86/dwt.c
new file mode 100644
index 0000000000..7ef9e2414e
--- /dev/null
+++ b/libavcodec/x86/dwt.c
@@ -0,0 +1,202 @@
+/*
+ * MMX optimized discrete wavelet transform
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "dsputil_mmx.h"
+#include "dwt.h"
+
+#define COMPOSE_VERTICAL(ext, align) \
+void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
+void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
+void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
+void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
+void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+\
+static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
+                                           IDWTELEM *b3, IDWTELEM *b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+\
+static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
+                                          IDWTELEM *b3, IDWTELEM *b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) { \
+        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
+    } \
+\
+    ff_vertical_compose_haar##ext(b0, b1, width_align); \
+} \
+static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = tmp[x];\
+        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+    }\
+}\
+static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = (tmp[x] + 1)>>1;\
+        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+    }\
+}\
+\
+
+#if HAVE_YASM
+#if !ARCH_X86_64
+COMPOSE_VERTICAL(_mmx, 4)
+#endif
+COMPOSE_VERTICAL(_sse2, 8)
+
+
+void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
+
+static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w)
+{
+    int w2= w>>1;
+    int x= w2 - (w2&7);
+    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
+
+    for (; x < w2; x++) {
+        b[2*x  ] = (tmp[x] + 1)>>1;
+        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+    }
+}
+#endif
+
+void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_YASM
+  int mm_flags = av_get_cpu_flags();
+
+#if !ARCH_X86_64
+    if (!(mm_flags & AV_CPU_FLAG_MMX))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar0i_mmx;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar1i_mmx;
+        break;
+    }
+#endif
+
+    if (!(mm_flags & AV_CPU_FLAG_SSE2))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar0i_sse2;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar1i_sse2;
+        break;
+    }
+
+    if (!(mm_flags & AV_CPU_FLAG_SSSE3))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
+        break;
+    }
+#endif // HAVE_YASM
+}
diff --git a/libavcodec/x86/dwt.h b/libavcodec/x86/dwt.h
new file mode 100644
index 0000000000..199f61175b
--- /dev/null
+++ b/libavcodec/x86/dwt.h
@@ -0,0 +1,30 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DWT_H
+#define AVCODEC_X86_DWT_H
+
+#include "libavcodec/dwt.h"
+
+void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+
+void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type);
+
+#endif
diff --git a/libavcodec/x86/dwt_yasm.asm b/libavcodec/x86/dwt_yasm.asm
new file mode 100644
index 0000000000..ac6c505409
--- /dev/null
+++ b/libavcodec/x86/dwt_yasm.asm
@@ -0,0 +1,291 @@
+;******************************************************************************
+;* MMX optimized discrete wavelet trasnform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_1991: times 4 dw 9,-1
+
+section .text
+
+; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
+%macro COMPOSE_53iL0 4
+    paddw   %2, %3
+    paddw   %2, %4
+    psraw   %2, 2
+    psubw   %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered  m3: pw_8  m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+    paddw   m0, %3
+    paddw   m1, %2
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+%if %0 > 3
+    movu    %1, %4
+%endif
+    psrad   m1, 4
+    psrad   m2, 4
+    packssdw m1, m2
+    paddw   m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+    mova    m2, [pw_2]
+.loop:
+    sub     widthd, mmsize/2
+    mova    m1, [b0q+2*widthq]
+    mova    m0, [b1q+2*widthq]
+    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+    mova    m1, [pw_1]
+.loop:
+    sub     widthd, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    paddw   m0, [b2q+2*widthq]
+    paddw   m0, m1
+    psraw   m0, 1
+    paddw   m0, [b1q+2*widthq]
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                               IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+.loop:
+    sub     widthd, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+    mova    [b2q+2*widthq], m1
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_16]
+    mova    m4, [pw_1991]
+.loop:
+    sub     widthd, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    mova    m5, [b2q+2*widthq]
+    paddw   m0, [b4q+2*widthq]
+    paddw   m1, [b3q+2*widthq]
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+    psrad   m1, 5
+    psrad   m2, 5
+    packssdw m1, m2
+    psubw   m5, m1
+    mova    [b2q+2*widthq], m5
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+    mova    m3, [pw_1]
+.loop:
+    sub     widthd, mmsize/2
+    mova    m1, [b1q+2*widthq]
+    mova    m0, [b0q+2*widthq]
+    mova    m2, m1
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [b0q+2*widthq], m0
+    paddw   m2, m0
+    mova    [b1q+2*widthq], m2
+    jg      .loop
+    REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+    mov     %3, [tmpq]
+%assign %%i 1
+%rep %1
+    mov     [tmpq-2*%%i], %3
+    %assign %%i %%i+1
+%endrep
+    mov     %3, [tmpq+2*w2q-2]
+%assign %%i 0
+%rep %2
+    mov     [tmpq+2*w2q+2*%%i], %3
+    %assign %%i %%i+1
+%endrep
+%endmacro
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd
+    xor     xq, xq
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]
+    mova    m3, [pw_1]
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [bq    + 2*xq]
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [tmpq + 2*xq], m0
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .lowpass_loop
+
+    xor     xq, xq
+    and    w2q, ~(mmsize/2 - 1)
+    cmp    w2q, mmsize/2
+    jl      .end
+
+.highpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [tmpq  + 2*xq]
+    paddw   m1, m0
+
+    ; shift and interleave
+%if %2 == 1
+    paddw   m0, m3
+    paddw   m1, m3
+    psraw   m0, 1
+    psraw   m1, 1
+%endif
+    mova    m2, m0
+    punpcklwd m0, m1
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m0
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .highpass_loop
+.end:
+    REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd
+    xor     xd, xd
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]
+    movu    m4, [bq+wq]
+    mova    m7, [pw_2]
+    pslldq  m4, 14
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [bq    + 2*xq]
+    mova    m2, m1
+    palignr m1, m4, 14
+    mova    m4, m2
+    COMPOSE_53iL0 m0, m1, m2, m7
+    mova    [tmpq + 2*xq], m0
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .lowpass_loop
+
+    EDGE_EXTENSION 1, 2, xw
+    ; leave the last up to 7 (sse) or 3 (mmx) values for C
+    xor     xd, xd
+    and    w2d, ~(mmsize/2 - 1)
+    cmp    w2d, mmsize/2
+    jl      .end
+
+    mova    m7, [tmpq-mmsize]
+    mova    m0, [tmpq]
+    mova    m5, [pw_1]
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+.highpass_loop:
+    mova    m6, m0
+    palignr m0, m7, 14
+    mova    m7, [tmpq + 2*xq + 16]
+    mova    m1, m7
+    mova    m2, m7
+    palignr m1, m6, 2
+    palignr m2, m6, 4
+    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
+    mova    m0, m7
+    mova    m7, m6
+
+    ; shift and interleave
+    paddw   m6, m5
+    paddw   m1, m5
+    psraw   m6, 1
+    psraw   m1, 1
+    mova    m2, m6
+    punpcklwd m6, m1
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m6
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .highpass_loop
+.end:
+    REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx
+HAAR_HORIZONTAL mmx, 0
+HAAR_HORIZONTAL mmx, 1
+%endif
+
+;;INIT_XMM
+INIT_XMM
+COMPOSE_VERTICAL sse2
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
diff --git a/libavcodec/x86/fdct_mmx.c b/libavcodec/x86/fdct_mmx.c
index 3614fd151a..f8fef4d8b2 100644
--- a/libavcodec/x86/fdct_mmx.c
+++ b/libavcodec/x86/fdct_mmx.c
@@ -13,20 +13,20 @@
  * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
  * Skal's fdct at http://skal.planet-d.net/coding/dct.html
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,7 +70,7 @@ DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
 
 DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
 
-static struct
+static const struct
 {
  DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
 } fdct_r_row_sse2 =
@@ -153,7 +153,7 @@ DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = {  // forward_dct
   29692,  -12299,   26722,  -31521,
 };
 
-static struct
+static const struct
 {
  DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
 } tab_frw_01234567_sse2 =
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index fcde3fa797..852c6b8d1e 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 6e80b95d11..3f8b21d95b 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 6082d9ee36..faa27a01c6 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -6,20 +6,20 @@
 ;* This algorithm (though not any of the implementation details) is
 ;* based on libdjbfft by D. J. Bernstein.
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -28,8 +28,8 @@
 ; in blocks as conventient to the vector size.
 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 %if ARCH_X86_64
 %define pointer resq
@@ -37,6 +37,8 @@
 %define pointer resd
 %endif
 
+SECTION_RODATA
+
 struc FFTContext
     .nbits:    resd 1
     .reverse:  resd 1
@@ -52,8 +54,6 @@ struc FFTContext
     .imdcthalf:pointer 1
 endstruc
 
-SECTION_RODATA
-
 %define M_SQRT1_2 0.70710678118654752440
 %define M_COS_PI_1_8 0.923879532511287
 %define M_COS_PI_3_8 0.38268343236509
@@ -394,6 +394,7 @@ fft32_interleave_avx:
     sub r2d, mmsize/4
     jg .deint_loop
     ret
+
 %endif
 
 INIT_XMM sse
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 0fd14fefa3..7368b8f518 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -2,25 +2,25 @@
 ;* x86 optimized Format Conversion Utils
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_TEXT
 
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index fbdc5262b9..814a17f631 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index 64a4efe057..4155947e58 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -3,25 +3,25 @@
 ;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 ;*               2005-2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
 
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 940a8f77e1..a4ab08722a 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,25 +7,25 @@
 ;*          Jason Garrett-Glaser <darkshikari@gmail.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
 
@@ -390,8 +390,10 @@ cglobal deblock_h_luma_8, 5,9
 
 INIT_XMM sse2
 DEBLOCK_LUMA
+%if HAVE_AVX
 INIT_XMM avx
 DEBLOCK_LUMA
+%endif
 
 %else
 
@@ -509,8 +511,10 @@ INIT_MMX mmx2
 DEBLOCK_LUMA v8, 8
 INIT_XMM sse2
 DEBLOCK_LUMA v, 16
+%if HAVE_AVX
 INIT_XMM avx
 DEBLOCK_LUMA v, 16
+%endif
 
 %endif ; ARCH
 
@@ -781,8 +785,10 @@ cglobal deblock_h_luma_intra_8, 2,4
 
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA v
+%endif
 %if ARCH_X86_64 == 0
 INIT_MMX mmx2
 DEBLOCK_LUMA_INTRA v8
@@ -843,7 +849,11 @@ cglobal deblock_h_chroma_8, 5,7
     TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     movq  buf0, m0
     movq  buf1, m3
-    call ff_chroma_inter_body_mmx2
+    LOAD_MASK  r2d, r3d
+    movd       m6, [r4] ; tc0
+    punpcklbw  m6, m6
+    pand       m7, m6
+    DEBLOCK_P0_Q0
     movq  m0, buf0
     movq  m3, buf1
     TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index ba2f91490e..d625eee4a4 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -24,8 +24,8 @@
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
 
@@ -165,7 +165,7 @@ cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
     SUB        rsp, pad
     shl        r2d, 2
     shl        r3d, 2
-    LOAD_AB     m4, m5, r2, r3
+    LOAD_AB     m4, m5, r2d, r3d
     mov         r3, 32/mmsize
     mov         r2, r0
     sub         r0, r1
@@ -222,7 +222,7 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
     SUB        rsp, pad
     shl        r2d, 2
     shl        r3d, 2
-    LOAD_AB     m4, m5, r2, r3
+    LOAD_AB     m4, m5, r2d, r3d
     mov         r3, r1
     mova        am, m4
     add         r3, r1
@@ -351,7 +351,7 @@ cglobal deblock_v_luma_10, 5,5,15
     %define mask2 m11
     shl        r2d, 2
     shl        r3d, 2
-    LOAD_AB    m12, m13, r2, r3
+    LOAD_AB    m12, m13, r2d, r3d
     mov         r2, r0
     sub         r0, r1
     sub         r0, r1
@@ -379,7 +379,7 @@ cglobal deblock_v_luma_10, 5,5,15
 cglobal deblock_h_luma_10, 5,7,15
     shl        r2d, 2
     shl        r3d, 2
-    LOAD_AB    m12, m13, r2, r3
+    LOAD_AB    m12, m13, r2d, r3d
     mov         r2, r1
     add         r2, r1
     add         r2, r1
@@ -418,9 +418,11 @@ cglobal deblock_h_luma_10, 5,7,15
 
 INIT_XMM sse2
 DEBLOCK_LUMA_64
+%if HAVE_AVX
 INIT_XMM avx
 DEBLOCK_LUMA_64
 %endif
+%endif
 
 %macro SWAPMOVA 2
 %ifid %1
@@ -713,8 +715,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16
 
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA_64
+%if HAVE_AVX
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA_64
+%endif
 
 %endif
 
@@ -798,10 +802,12 @@ DEBLOCK_LUMA_INTRA
 INIT_XMM sse2
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
+%if HAVE_AVX
 INIT_XMM avx
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
 %endif
+%endif
 
 ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
 ; out: %1=p0', %2=q0'
@@ -857,7 +863,7 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
 .loop:
 %endif
     CHROMA_V_LOAD r5
-    LOAD_AB     m4, m5, r2, r3
+    LOAD_AB     m4, m5, r2d, r3d
     LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
     pxor        m4, m4
     CHROMA_V_LOAD_TC m6, r4
@@ -891,7 +897,7 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
 .loop:
 %endif
     CHROMA_V_LOAD r4
-    LOAD_AB     m4, m5, r2, r3
+    LOAD_AB     m4, m5, r2d, r3d
     LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
     CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
     CHROMA_V_STORE
@@ -912,5 +918,7 @@ DEBLOCK_CHROMA
 %endif
 INIT_XMM sse2
 DEBLOCK_CHROMA
+%if HAVE_AVX
 INIT_XMM avx
 DEBLOCK_CHROMA
+%endif
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index 2daa40ae52..b059cf9423 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index cc83806884..da045f71e2 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -9,25 +9,25 @@
 ;*          Holger Lubitz <hal@duncan.ol.sub.de>
 ;*          Min Chen <chenm001.163.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
 
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 2aab9864d6..67bc68ec5c 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -22,8 +22,8 @@
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
 
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index 50a615b054..fd78456294 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -5,25 +5,25 @@
 ;* Copyright (c) 2010 Loren Merritt
 ;* Copyright (c) 2010 Ronald S. Bultje
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
 
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index 1423b561ac..79fa23e71d 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -22,8 +22,8 @@
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
 
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 08c35c7138..59ab3ea27e 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Jason Garrett-Glaser
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264_qpel_mmx.c b/libavcodec/x86/h264_qpel_mmx.c
index fc1635de8b..71a1fbeed9 100644
--- a/libavcodec/x86/h264_qpel_mmx.c
+++ b/libavcodec/x86/h264_qpel_mmx.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  * Copyright (c) 2011 Daniel Kang
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1286,6 +1286,6 @@ QPEL16_OP(mc31, MMX)\
 QPEL16_OP(mc32, MMX)\
 QPEL16_OP(mc33, MMX)
 
-#if ARCH_X86_32 && HAVE_YASM // ARCH_X86_64 implies sse2+
+#if CONFIG_H264QPEL && ARCH_X86_32 && HAVE_YASM // ARCH_X86_64 implies sse2+
 QPEL16(mmxext)
 #endif
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index 22ce72d19f..26233bf080 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -4,24 +4,24 @@
 ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
+%include "libavutil/x86/x86inc.asm"
 
 SECTION .text
 
@@ -253,6 +253,13 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
     add  off_regd, 1
     or   off_regd, 1
     add        r4, 1
+    cmp        r5, 128
+     jne .normal
+    sar        r5, 1
+    sar        r6, 1
+    sar  off_regd, 1
+    sub        r4, 1
+.normal
     movd       m4, r5d
     movd       m0, r6d
     movd       m5, off_regd
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 0612ffbb8b..87b9452501 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -133,7 +133,7 @@ LF_IFUNC(v, chroma_intra, depth, avx)
 LF_FUNCS(uint8_t,   8)
 LF_FUNCS(uint16_t, 10)
 
-#if ARCH_X86_32
+#if ARCH_X86_32 && HAVE_YASM
 LF_FUNC(v8, luma, 8, mmx2)
 static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha,
                                      int beta, int8_t *tc0)
@@ -292,7 +292,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                     c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
                     c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
                 }
-                if (mm_flags & AV_CPU_FLAG_AVX) {
+                if (HAVE_AVX && mm_flags & AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
                     c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
                     c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
diff --git a/libavcodec/x86/idct_mmx.c b/libavcodec/x86/idct_mmx.c
new file mode 100644
index 0000000000..2408ab26ad
--- /dev/null
+++ b/libavcodec/x86/idct_mmx.c
@@ -0,0 +1,632 @@
+/*
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with mpeg2dec; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavcodec/dsputil.h"
+
+#include "libavutil/x86_cpu.h"
+#include "dsputil_mmx.h"
+
+#if HAVE_INLINE_ASM
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 6
+
+#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
+#define rounder(bias) {round (bias), round (bias)}
+
+
+#if 0
+/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
+static inline void idct_row (int16_t * row, int offset,
+                             int16_t * table, int32_t * rounder)
+{
+    int C1, C2, C3, C4, C5, C6, C7;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+
+    row += offset;
+
+    C1 = table[1];
+    C2 = table[2];
+    C3 = table[3];
+    C4 = table[4];
+    C5 = table[5];
+    C6 = table[6];
+    C7 = table[7];
+
+    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
+    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
+    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
+    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
+
+    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+    row[0] = (a0 + b0) >> ROW_SHIFT;
+    row[1] = (a1 + b1) >> ROW_SHIFT;
+    row[2] = (a2 + b2) >> ROW_SHIFT;
+    row[3] = (a3 + b3) >> ROW_SHIFT;
+    row[4] = (a3 - b3) >> ROW_SHIFT;
+    row[5] = (a2 - b2) >> ROW_SHIFT;
+    row[6] = (a1 - b1) >> ROW_SHIFT;
+    row[7] = (a0 - b0) >> ROW_SHIFT;
+}
+#endif
+
+
+/* MMXEXT row IDCT */
+
+#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)      {  c4,  c2, -c4, -c2,   \
+                                                   c4,  c6,  c4,  c6,   \
+                                                   c1,  c3, -c1, -c5,   \
+                                                   c5,  c7,  c3, -c7,   \
+                                                   c4, -c6,  c4, -c6,   \
+                                                  -c4,  c2,  c4, -c2,   \
+                                                   c5, -c1,  c3, -c1,   \
+                                                   c7,  c3,  c7, -c5 }
+
+static inline void mmxext_row_head (int16_t * const row, const int offset,
+                                    const int16_t * const table)
+{
+    __asm__ volatile(
+        "movq     (%0), %%mm2        \n\t"  /* mm2 = x6 x4 x2 x0 */
+
+        "movq    8(%0), %%mm5        \n\t"  /* mm5 = x7 x5 x3 x1 */
+        "movq    %%mm2, %%mm0        \n\t"  /* mm0 = x6 x4 x2 x0 */
+
+        "movq     (%1), %%mm3        \n\t"  /* mm3 = -C2 -C4 C2 C4 */
+        "movq    %%mm5, %%mm6        \n\t"  /* mm6 = x7 x5 x3 x1 */
+
+        "movq    8(%1), %%mm4        \n\t"  /* mm4 = C6 C4 C6 C4 */
+        "pmaddwd %%mm0, %%mm3        \n\t"  /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
+
+        "pshufw  $0x4e, %%mm2, %%mm2 \n\t"  /* mm2 = x2 x0 x6 x4 */
+        :: "r" ((row+offset)), "r" (table)
+    );
+}
+
+static inline void mmxext_row (const int16_t * const table,
+                               const int32_t * const rounder)
+{
+    __asm__ volatile (
+        "movq    16(%0), %%mm1         \n\t" /* mm1 = -C5 -C1 C3 C1 */
+        "pmaddwd  %%mm2, %%mm4         \n\t" /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
+
+        "pmaddwd 32(%0), %%mm0         \n\t" /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
+        "pshufw   $0x4e, %%mm6, %%mm6  \n\t" /* mm6 = x3 x1 x7 x5 */
+
+        "movq    24(%0), %%mm7         \n\t" /* mm7 = -C7 C3 C7 C5 */
+        "pmaddwd  %%mm5, %%mm1         \n\t" /* mm1= -C1*x5-C5*x7 C1*x1+C3*x3 */
+
+        "paddd     (%1), %%mm3         \n\t" /* mm3 += rounder */
+        "pmaddwd  %%mm6, %%mm7         \n\t" /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
+
+        "pmaddwd 40(%0), %%mm2         \n\t" /* mm2= C4*x0-C2*x2 -C4*x4+C2*x6 */
+        "paddd    %%mm4, %%mm3         \n\t" /* mm3 = a1 a0 + rounder */
+
+        "pmaddwd 48(%0), %%mm5         \n\t" /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
+        "movq     %%mm3, %%mm4         \n\t" /* mm4 = a1 a0 + rounder */
+
+        "pmaddwd 56(%0), %%mm6         \n\t" /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
+        "paddd    %%mm7, %%mm1         \n\t" /* mm1 = b1 b0 */
+
+        "paddd     (%1), %%mm0         \n\t" /* mm0 += rounder */
+        "psubd    %%mm1, %%mm3         \n\t" /* mm3 = a1-b1 a0-b0 + rounder */
+
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3         \n\t" /* mm3 = y6 y7 */
+        "paddd    %%mm4, %%mm1         \n\t" /* mm1 = a1+b1 a0+b0 + rounder */
+
+        "paddd    %%mm2, %%mm0         \n\t" /* mm0 = a3 a2 + rounder */
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1         \n\t" /* mm1 = y1 y0 */
+
+        "paddd    %%mm6, %%mm5         \n\t" /* mm5 = b3 b2 */
+        "movq     %%mm0, %%mm4         \n\t" /* mm4 = a3 a2 + rounder */
+
+        "paddd    %%mm5, %%mm0         \n\t" /* mm0 = a3+b3 a2+b2 + rounder */
+        "psubd    %%mm5, %%mm4         \n\t" /* mm4 = a3-b3 a2-b2 + rounder */
+        : : "r" (table), "r" (rounder));
+}
+
+static inline void mmxext_row_tail (int16_t * const row, const int store)
+{
+    __asm__ volatile (
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0        \n\t"  /* mm0 = y3 y2 */
+
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4  \n\t"  /* mm4 = y4 y5 */
+
+        "packssdw %%mm0, %%mm1        \n\t"  /* mm1 = y3 y2 y1 y0 */
+
+        "packssdw %%mm3, %%mm4        \n\t"  /* mm4 = y6 y7 y4 y5 */
+
+        "movq     %%mm1, (%0)         \n\t"  /* save y3 y2 y1 y0 */
+        "pshufw   $0xb1, %%mm4, %%mm4 \n\t"  /* mm4 = y7 y6 y5 y4 */
+
+        /* slot */
+
+        "movq     %%mm4, 8(%0)        \n\t"  /* save y7 y6 y5 y4 */
+        :: "r" (row+store)
+        );
+}
+
+static inline void mmxext_row_mid (int16_t * const row, const int store,
+                                   const int offset,
+                                   const int16_t * const table)
+{
+    __asm__ volatile (
+        "movq     (%0,%1), %%mm2       \n\t" /* mm2 = x6 x4 x2 x0 */
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t"   /* mm0 = y3 y2 */
+
+        "movq    8(%0,%1), %%mm5       \n\t" /* mm5 = x7 x5 x3 x1 */
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */
+
+        "packssdw   %%mm0, %%mm1       \n\t" /* mm1 = y3 y2 y1 y0 */
+        "movq       %%mm5, %%mm6       \n\t" /* mm6 = x7 x5 x3 x1 */
+
+        "packssdw   %%mm3, %%mm4       \n\t" /* mm4 = y6 y7 y4 y5 */
+        "movq       %%mm2, %%mm0       \n\t" /* mm0 = x6 x4 x2 x0 */
+
+        "movq       %%mm1, (%0,%2)     \n\t" /* save y3 y2 y1 y0 */
+        "pshufw     $0xb1, %%mm4, %%mm4\n\t" /* mm4 = y7 y6 y5 y4 */
+
+        "movq        (%3), %%mm3       \n\t" /* mm3 = -C2 -C4 C2 C4 */
+        "movq       %%mm4, 8(%0,%2)    \n\t" /* save y7 y6 y5 y4 */
+
+        "pmaddwd    %%mm0, %%mm3       \n\t" /* mm3= -C4*x4-C2*x6 C4*x0+C2*x2 */
+
+        "movq       8(%3), %%mm4       \n\t" /* mm4 = C6 C4 C6 C4 */
+        "pshufw     $0x4e, %%mm2, %%mm2\n\t" /* mm2 = x2 x0 x6 x4 */
+        :: "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table)
+        );
+}
+
+
+/* MMX row IDCT */
+
+#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2,  c4,  c6,   \
+                                           c4,  c6, -c4, -c2,   \
+                                           c1,  c3,  c3, -c7,   \
+                                           c5,  c7, -c1, -c5,   \
+                                           c4, -c6,  c4, -c2,   \
+                                          -c4,  c2,  c4, -c6,   \
+                                           c5, -c1,  c7, -c5,   \
+                                           c7,  c3,  c3, -c1 }
+
+static inline void mmx_row_head (int16_t * const row, const int offset,
+                                 const int16_t * const table)
+{
+    __asm__ volatile (
+        "movq (%0), %%mm2       \n\t"    /* mm2 = x6 x4 x2 x0 */
+
+        "movq 8(%0), %%mm5      \n\t"    /* mm5 = x7 x5 x3 x1 */
+        "movq %%mm2, %%mm0      \n\t"    /* mm0 = x6 x4 x2 x0 */
+
+        "movq (%1), %%mm3       \n\t"    /* mm3 = C6 C4 C2 C4 */
+        "movq %%mm5, %%mm6      \n\t"    /* mm6 = x7 x5 x3 x1 */
+
+        "punpckldq %%mm0, %%mm0 \n\t"    /* mm0 = x2 x0 x2 x0 */
+
+        "movq 8(%1), %%mm4      \n\t"    /* mm4 = -C2 -C4 C6 C4 */
+        "pmaddwd %%mm0, %%mm3   \n\t"    /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
+
+        "movq 16(%1), %%mm1     \n\t"    /* mm1 = -C7 C3 C3 C1 */
+        "punpckhdq %%mm2, %%mm2 \n\t"    /* mm2 = x6 x4 x6 x4 */
+        :: "r" ((row+offset)), "r" (table)
+        );
+}
+
+static inline void mmx_row (const int16_t * const table,
+                            const int32_t * const rounder)
+{
+    __asm__ volatile (
+        "pmaddwd   %%mm2, %%mm4    \n\t"  /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
+        "punpckldq %%mm5, %%mm5    \n\t"  /* mm5 = x3 x1 x3 x1 */
+
+        "pmaddwd  32(%0), %%mm0    \n\t"  /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
+        "punpckhdq %%mm6, %%mm6    \n\t"  /* mm6 = x7 x5 x7 x5 */
+
+        "movq     24(%0), %%mm7    \n\t"  /* mm7 = -C5 -C1 C7 C5 */
+        "pmaddwd   %%mm5, %%mm1    \n\t"  /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
+
+        "paddd      (%1), %%mm3    \n\t"  /* mm3 += rounder */
+        "pmaddwd   %%mm6, %%mm7    \n\t"  /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
+
+        "pmaddwd  40(%0), %%mm2    \n\t"  /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
+        "paddd     %%mm4, %%mm3    \n\t"  /* mm3 = a1 a0 + rounder */
+
+        "pmaddwd  48(%0), %%mm5    \n\t"  /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
+        "movq      %%mm3, %%mm4    \n\t"  /* mm4 = a1 a0 + rounder */
+
+        "pmaddwd  56(%0), %%mm6    \n\t"  /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
+        "paddd     %%mm7, %%mm1    \n\t"  /* mm1 = b1 b0 */
+
+        "paddd      (%1), %%mm0    \n\t"  /* mm0 += rounder */
+        "psubd     %%mm1, %%mm3    \n\t"  /* mm3 = a1-b1 a0-b0 + rounder */
+
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3    \n\t"  /* mm3 = y6 y7 */
+        "paddd     %%mm4, %%mm1    \n\t"  /* mm1 = a1+b1 a0+b0 + rounder */
+
+        "paddd     %%mm2, %%mm0    \n\t"  /* mm0 = a3 a2 + rounder */
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1    \n\t"  /* mm1 = y1 y0 */
+
+        "paddd     %%mm6, %%mm5    \n\t"  /* mm5 = b3 b2 */
+        "movq      %%mm0, %%mm7    \n\t"  /* mm7 = a3 a2 + rounder */
+
+        "paddd     %%mm5, %%mm0    \n\t"  /* mm0 = a3+b3 a2+b2 + rounder */
+        "psubd     %%mm5, %%mm7    \n\t"  /* mm7 = a3-b3 a2-b2 + rounder */
+        :: "r" (table), "r" (rounder)
+        );
+}
+
+static inline void mmx_row_tail (int16_t * const row, const int store)
+{
+    __asm__ volatile (
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0      \n\t" /* mm0 = y3 y2 */
+
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7      \n\t" /* mm7 = y4 y5 */
+
+        "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */
+
+        "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */
+
+        "movq %%mm1, (%0)      \n\t" /* save y3 y2 y1 y0 */
+        "movq %%mm7, %%mm4     \n\t" /* mm4 = y6 y7 y4 y5 */
+
+        "pslld $16, %%mm7      \n\t" /* mm7 = y7 0 y5 0 */
+
+        "psrld $16, %%mm4      \n\t" /* mm4 = 0 y6 0 y4 */
+
+        "por %%mm4, %%mm7      \n\t" /* mm7 = y7 y6 y5 y4 */
+
+        /* slot */
+
+        "movq %%mm7, 8(%0)     \n\t" /* save y7 y6 y5 y4 */
+        :: "r" (row+store)
+        );
+}
+
+static inline void mmx_row_mid (int16_t * const row, const int store,
+                                const int offset, const int16_t * const table)
+{
+
+    __asm__ volatile (
+        "movq    (%0,%1), %%mm2    \n\t" /* mm2 = x6 x4 x2 x0 */
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */
+
+        "movq   8(%0,%1), %%mm5    \n\t" /* mm5 = x7 x5 x3 x1 */
+        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */
+
+        "packssdw  %%mm0, %%mm1    \n\t" /* mm1 = y3 y2 y1 y0 */
+        "movq      %%mm5, %%mm6    \n\t" /* mm6 = x7 x5 x3 x1 */
+
+        "packssdw  %%mm3, %%mm7    \n\t" /* mm7 = y6 y7 y4 y5 */
+        "movq      %%mm2, %%mm0    \n\t" /* mm0 = x6 x4 x2 x0 */
+
+        "movq      %%mm1, (%0,%2)  \n\t" /* save y3 y2 y1 y0 */
+        "movq      %%mm7, %%mm1    \n\t" /* mm1 = y6 y7 y4 y5 */
+
+        "punpckldq %%mm0, %%mm0    \n\t" /* mm0 = x2 x0 x2 x0 */
+        "psrld       $16, %%mm7    \n\t" /* mm7 = 0 y6 0 y4 */
+
+        "movq       (%3), %%mm3    \n\t" /* mm3 = C6 C4 C2 C4 */
+        "pslld       $16, %%mm1    \n\t" /* mm1 = y7 0 y5 0 */
+
+        "movq      8(%3), %%mm4    \n\t" /* mm4 = -C2 -C4 C6 C4 */
+        "por       %%mm1, %%mm7    \n\t" /* mm7 = y7 y6 y5 y4 */
+
+        "movq     16(%3), %%mm1    \n\t" /* mm1 = -C7 C3 C3 C1 */
+        "punpckhdq %%mm2, %%mm2    \n\t" /* mm2 = x6 x4 x6 x4 */
+
+        "movq      %%mm7, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */
+        "pmaddwd   %%mm0, %%mm3    \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
+        : : "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table)
+        );
+}
+
+
+#if 0
+/* C column IDCT - it is just here to document the MMXEXT and MMX versions */
+static inline void idct_col (int16_t * col, int offset)
+{
+/* multiplication - as implemented on mmx */
+#define F(c,x) (((c) * (x)) >> 16)
+
+/* saturation - it helps us handle torture test cases */
+#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
+
+    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
+    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
+    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
+    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
+
+    col += offset;
+
+    x0 = col[0*8];
+    x1 = col[1*8];
+    x2 = col[2*8];
+    x3 = col[3*8];
+    x4 = col[4*8];
+    x5 = col[5*8];
+    x6 = col[6*8];
+    x7 = col[7*8];
+
+    u04 = S (x0 + x4);
+    v04 = S (x0 - x4);
+    u26 = S (F (T2, x6) + x2);
+    v26 = S (F (T2, x2) - x6);
+
+    a0 = S (u04 + u26);
+    a1 = S (v04 + v26);
+    a2 = S (v04 - v26);
+    a3 = S (u04 - u26);
+
+    u17 = S (F (T1, x7) + x1);
+    v17 = S (F (T1, x1) - x7);
+    u35 = S (F (T3, x5) + x3);
+    v35 = S (F (T3, x3) - x5);
+
+    b0 = S (u17 + u35);
+    b3 = S (v17 - v35);
+    u12 = S (u17 - u35);
+    v12 = S (v17 + v35);
+    u12 = S (2 * F (C4, u12));
+    v12 = S (2 * F (C4, v12));
+    b1 = S (u12 + v12);
+    b2 = S (u12 - v12);
+
+    y0 = S (a0 + b0) >> COL_SHIFT;
+    y1 = S (a1 + b1) >> COL_SHIFT;
+    y2 = S (a2 + b2) >> COL_SHIFT;
+    y3 = S (a3 + b3) >> COL_SHIFT;
+
+    y4 = S (a3 - b3) >> COL_SHIFT;
+    y5 = S (a2 - b2) >> COL_SHIFT;
+    y6 = S (a1 - b1) >> COL_SHIFT;
+    y7 = S (a0 - b0) >> COL_SHIFT;
+
+    col[0*8] = y0;
+    col[1*8] = y1;
+    col[2*8] = y2;
+    col[3*8] = y3;
+    col[4*8] = y4;
+    col[5*8] = y5;
+    col[6*8] = y6;
+    col[7*8] = y7;
+}
+#endif
+
+
+/* MMX column IDCT */
+static inline void idct_col (int16_t * const col, const int offset)
+{
+#define T1 13036
+#define T2 27146
+#define T3 43790
+#define C4 23170
+
+    DECLARE_ALIGNED(8, static const short, t1_vector)[] = {
+        T1,T1,T1,T1,
+        T2,T2,T2,T2,
+        T3,T3,T3,T3,
+        C4,C4,C4,C4
+    };
+
+    /* column code adapted from Peter Gubanov */
+    /* http://www.elecard.com/peter/idct.shtml */
+
+    __asm__ volatile (
+        "movq      (%0), %%mm0    \n\t" /* mm0 = T1 */
+
+        "movq   2*8(%1), %%mm1    \n\t" /* mm1 = x1 */
+        "movq     %%mm0, %%mm2    \n\t" /* mm2 = T1 */
+
+        "movq 7*2*8(%1), %%mm4    \n\t" /* mm4 = x7 */
+        "pmulhw   %%mm1, %%mm0    \n\t" /* mm0 = T1*x1 */
+
+        "movq    16(%0), %%mm5    \n\t" /* mm5 = T3 */
+        "pmulhw   %%mm4, %%mm2    \n\t" /* mm2 = T1*x7 */
+
+        "movq 2*5*8(%1), %%mm6    \n\t" /* mm6 = x5 */
+        "movq     %%mm5, %%mm7    \n\t" /* mm7 = T3-1 */
+
+        "movq 3*8*2(%1), %%mm3    \n\t" /* mm3 = x3 */
+        "psubsw   %%mm4, %%mm0    \n\t" /* mm0 = v17 */
+
+        "movq     8(%0), %%mm4    \n\t" /* mm4 = T2 */
+        "pmulhw   %%mm3, %%mm5    \n\t" /* mm5 = (T3-1)*x3 */
+
+        "paddsw   %%mm2, %%mm1    \n\t" /* mm1 = u17 */
+        "pmulhw   %%mm6, %%mm7    \n\t" /* mm7 = (T3-1)*x5 */
+
+        /* slot */
+
+        "movq     %%mm4, %%mm2    \n\t" /* mm2 = T2 */
+        "paddsw   %%mm3, %%mm5    \n\t" /* mm5 = T3*x3 */
+
+        "pmulhw 2*8*2(%1), %%mm4  \n\t" /* mm4 = T2*x2 */
+        "paddsw   %%mm6, %%mm7    \n\t" /* mm7 = T3*x5 */
+
+        "psubsw   %%mm6, %%mm5    \n\t" /* mm5 = v35 */
+        "paddsw   %%mm3, %%mm7    \n\t" /* mm7 = u35 */
+
+        "movq 6*8*2(%1), %%mm3    \n\t" /* mm3 = x6 */
+        "movq     %%mm0, %%mm6    \n\t" /* mm6 = v17 */
+
+        "pmulhw   %%mm3, %%mm2    \n\t" /* mm2 = T2*x6 */
+        "psubsw   %%mm5, %%mm0    \n\t" /* mm0 = b3 */
+
+        "psubsw   %%mm3, %%mm4    \n\t" /* mm4 = v26 */
+        "paddsw   %%mm6, %%mm5    \n\t" /* mm5 = v12 */
+
+        "movq     %%mm0, 3*8*2(%1)\n\t" /* save b3 in scratch0 */
+        "movq     %%mm1, %%mm6    \n\t" /* mm6 = u17 */
+
+        "paddsw 2*8*2(%1), %%mm2  \n\t" /* mm2 = u26 */
+        "paddsw   %%mm7, %%mm6    \n\t" /* mm6 = b0 */
+
+        "psubsw   %%mm7, %%mm1    \n\t" /* mm1 = u12 */
+        "movq     %%mm1, %%mm7    \n\t" /* mm7 = u12 */
+
+        "movq   0*8(%1), %%mm3    \n\t" /* mm3 = x0 */
+        "paddsw   %%mm5, %%mm1    \n\t" /* mm1 = u12+v12 */
+
+        "movq    24(%0), %%mm0    \n\t" /* mm0 = C4/2 */
+        "psubsw   %%mm5, %%mm7    \n\t" /* mm7 = u12-v12 */
+
+        "movq     %%mm6, 5*8*2(%1)\n\t" /* save b0 in scratch1 */
+        "pmulhw   %%mm0, %%mm1    \n\t" /* mm1 = b1/2 */
+
+        "movq     %%mm4, %%mm6    \n\t" /* mm6 = v26 */
+        "pmulhw   %%mm0, %%mm7    \n\t" /* mm7 = b2/2 */
+
+        "movq 4*8*2(%1), %%mm5    \n\t" /* mm5 = x4 */
+        "movq     %%mm3, %%mm0    \n\t" /* mm0 = x0 */
+
+        "psubsw   %%mm5, %%mm3    \n\t" /* mm3 = v04 */
+        "paddsw   %%mm5, %%mm0    \n\t" /* mm0 = u04 */
+
+        "paddsw   %%mm3, %%mm4    \n\t" /* mm4 = a1 */
+        "movq     %%mm0, %%mm5    \n\t" /* mm5 = u04 */
+
+        "psubsw   %%mm6, %%mm3    \n\t" /* mm3 = a2 */
+        "paddsw   %%mm2, %%mm5    \n\t" /* mm5 = a0 */
+
+        "paddsw   %%mm1, %%mm1    \n\t" /* mm1 = b1 */
+        "psubsw   %%mm2, %%mm0    \n\t" /* mm0 = a3 */
+
+        "paddsw   %%mm7, %%mm7    \n\t" /* mm7 = b2 */
+        "movq     %%mm3, %%mm2    \n\t" /* mm2 = a2 */
+
+        "movq     %%mm4, %%mm6    \n\t" /* mm6 = a1 */
+        "paddsw   %%mm7, %%mm3    \n\t" /* mm3 = a2+b2 */
+
+        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y2 */
+        "paddsw   %%mm1, %%mm4\n\t" /* mm4 = a1+b1 */
+
+        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y1 */
+        "psubsw   %%mm1, %%mm6    \n\t" /* mm6 = a1-b1 */
+
+        "movq 5*8*2(%1), %%mm1    \n\t" /* mm1 = b0 */
+        "psubsw   %%mm7, %%mm2    \n\t" /* mm2 = a2-b2 */
+
+        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm6\n\t" /* mm6 = y6 */
+        "movq     %%mm5, %%mm7    \n\t" /* mm7 = a0 */
+
+        "movq     %%mm4, 1*8*2(%1)\n\t" /* save y1 */
+        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm2\n\t" /* mm2 = y5 */
+
+        "movq     %%mm3, 2*8*2(%1)\n\t" /* save y2 */
+        "paddsw   %%mm1, %%mm5    \n\t" /* mm5 = a0+b0 */
+
+        "movq 3*8*2(%1), %%mm4    \n\t" /* mm4 = b3 */
+        "psubsw   %%mm1, %%mm7    \n\t" /* mm7 = a0-b0 */
+
+        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm5\n\t" /* mm5 = y0 */
+        "movq     %%mm0, %%mm3    \n\t" /* mm3 = a3 */
+
+        "movq     %%mm2, 5*8*2(%1)\n\t" /* save y5 */
+        "psubsw   %%mm4, %%mm3    \n\t" /* mm3 = a3-b3 */
+
+        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm7\n\t" /* mm7 = y7 */
+        "paddsw   %%mm0, %%mm4    \n\t" /* mm4 = a3+b3 */
+
+        "movq     %%mm5, 0*8*2(%1)\n\t" /* save y0 */
+        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y4 */
+
+        "movq     %%mm6, 6*8*2(%1)\n\t" /* save y6 */
+        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y3 */
+
+        "movq     %%mm7, 7*8*2(%1)\n\t" /* save y7 */
+
+        "movq     %%mm3, 4*8*2(%1)\n\t" /* save y4 */
+
+        "movq     %%mm4, 3*8*2(%1)\n\t" /* save y3 */
+        :: "r" (t1_vector), "r" (col+offset)
+        );
+
+#undef T1
+#undef T2
+#undef T3
+#undef C4
+}
+
+
+DECLARE_ALIGNED(8, static const int32_t, rounder0)[] =
+    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
+DECLARE_ALIGNED(8, static const int32_t, rounder4)[] = rounder (0);
+DECLARE_ALIGNED(8, static const int32_t, rounder1)[] =
+    rounder (1.25683487303);        /* C1*(C1/C4+C1+C7)/2 */
+DECLARE_ALIGNED(8, static const int32_t, rounder7)[] =
+    rounder (-0.25);                /* C1*(C7/C4+C7-C1)/2 */
+DECLARE_ALIGNED(8, static const int32_t, rounder2)[] =
+    rounder (0.60355339059);        /* C2 * (C6+C2)/2 */
+DECLARE_ALIGNED(8, static const int32_t, rounder6)[] =
+    rounder (-0.25);                /* C2 * (C6-C2)/2 */
+DECLARE_ALIGNED(8, static const int32_t, rounder3)[] =
+    rounder (0.087788325588);       /* C3*(-C3/C4+C3+C5)/2 */
+DECLARE_ALIGNED(8, static const int32_t, rounder5)[] =
+    rounder (-0.441341716183);      /* C3*(-C5/C4+C5-C3)/2 */
+
+#undef COL_SHIFT
+#undef ROW_SHIFT
+
+#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
+void idct (int16_t * const block)                                       \
+{                                                                       \
+    DECLARE_ALIGNED(16, static const int16_t, table04)[] =              \
+        table (22725, 21407, 19266, 16384, 12873,  8867, 4520);         \
+    DECLARE_ALIGNED(16, static const int16_t, table17)[] =              \
+        table (31521, 29692, 26722, 22725, 17855, 12299, 6270);         \
+    DECLARE_ALIGNED(16, static const int16_t, table26)[] =              \
+        table (29692, 27969, 25172, 21407, 16819, 11585, 5906);         \
+    DECLARE_ALIGNED(16, static const int16_t, table35)[] =              \
+        table (26722, 25172, 22654, 19266, 15137, 10426, 5315);         \
+                                                                        \
+    idct_row_head (block, 0*8, table04);                                \
+    idct_row (table04, rounder0);                                       \
+    idct_row_mid (block, 0*8, 4*8, table04);                            \
+    idct_row (table04, rounder4);                                       \
+    idct_row_mid (block, 4*8, 1*8, table17);                            \
+    idct_row (table17, rounder1);                                       \
+    idct_row_mid (block, 1*8, 7*8, table17);                            \
+    idct_row (table17, rounder7);                                       \
+    idct_row_mid (block, 7*8, 2*8, table26);                            \
+    idct_row (table26, rounder2);                                       \
+    idct_row_mid (block, 2*8, 6*8, table26);                            \
+    idct_row (table26, rounder6);                                       \
+    idct_row_mid (block, 6*8, 3*8, table35);                            \
+    idct_row (table35, rounder3);                                       \
+    idct_row_mid (block, 3*8, 5*8, table35);                            \
+    idct_row (table35, rounder5);                                       \
+    idct_row_tail (block, 5*8);                                         \
+                                                                        \
+    idct_col (block, 0);                                                \
+    idct_col (block, 4);                                                \
+}
+
+declare_idct (ff_mmxext_idct, mmxext_table,
+              mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
+
+declare_idct (ff_mmx_idct, mmx_table,
+              mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
index e4c778c398..5d8027fb27 100644
--- a/libavcodec/x86/idct_mmx_xvid.c
+++ b/libavcodec/x86/idct_mmx_xvid.c
@@ -22,20 +22,20 @@
  *
  * conversion to gcc syntax by Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
+ * along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index 8249e97ccf..3708f93df8 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -9,7 +9,7 @@
  *
  * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
  * Vertical pass is an implementation of the scheme:
  *  Loeffler C., Ligtenberg A., and Moschytz C.S.:
@@ -23,18 +23,18 @@
  *
  * More details at http://skal.planet-d.net/coding/dct.html
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
+ * along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/idct_xvid.h b/libavcodec/x86/idct_xvid.h
index 495d2caaf9..be91d1c68a 100644
--- a/libavcodec/x86/idct_xvid.h
+++ b/libavcodec/x86/idct_xvid.h
@@ -1,20 +1,20 @@
 /*
  * XVID MPEG-4 VIDEO CODEC
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/imdct36_sse.asm b/libavcodec/x86/imdct36_sse.asm
index 937a2cc416..336e9f0c54 100644
--- a/libavcodec/x86/imdct36_sse.asm
+++ b/libavcodec/x86/imdct36_sse.asm
@@ -371,8 +371,10 @@ DEFINE_IMDCT
 INIT_XMM ssse3
 DEFINE_IMDCT
 
+%if HAVE_AVX
 INIT_XMM avx
 DEFINE_IMDCT
+%endif
 
 INIT_XMM sse
 
@@ -717,5 +719,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
 INIT_XMM sse
 DEFINE_FOUR_IMDCT
 
+%if HAVE_AVX
 INIT_XMM avx
 DEFINE_FOUR_IMDCT
+%endif
diff --git a/libavcodec/x86/lpc_mmx.c b/libavcodec/x86/lpc_mmx.c
index 27bebe856a..087a324a43 100644
--- a/libavcodec/x86/lpc_mmx.c
+++ b/libavcodec/x86/lpc_mmx.c
@@ -2,20 +2,20 @@
  * MMX optimized LPC DSP utils
  * Copyright (c) 2007 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h
index e056eb0a2d..78f37f24eb 100644
--- a/libavcodec/x86/mathops.h
+++ b/libavcodec/x86/mathops.h
@@ -2,20 +2,20 @@
  * simple math operations
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp.c
index 400855d7c4..7ea77fc1b8 100644
--- a/libavcodec/x86/mlpdsp.c
+++ b/libavcodec/x86/mlpdsp.c
@@ -2,20 +2,20 @@
  * MLP DSP functions x86-optimized
  * Copyright (c) 2009 Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/motion_est_mmx.c b/libavcodec/x86/motion_est_mmx.c
index ab845c1129..1d3545a5e8 100644
--- a/libavcodec/x86/motion_est_mmx.c
+++ b/libavcodec/x86/motion_est_mmx.c
@@ -5,20 +5,20 @@
  *
  * mostly by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/mpegaudiodec_mmx.c b/libavcodec/x86/mpegaudiodec_mmx.c
index 88a347796c..0d6cc08305 100644
--- a/libavcodec/x86/mpegaudiodec_mmx.c
+++ b/libavcodec/x86/mpegaudiodec_mmx.c
@@ -2,20 +2,20 @@
  * MMX optimized MP3 decoding functions
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -214,11 +214,17 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
     }                                                                   \
 }
 
+#if HAVE_YASM
+#if HAVE_SSE
 DECL_IMDCT_BLOCKS(sse,sse)
 DECL_IMDCT_BLOCKS(sse2,sse)
 DECL_IMDCT_BLOCKS(sse3,sse)
 DECL_IMDCT_BLOCKS(ssse3,sse)
+#endif
+#if HAVE_AVX
 DECL_IMDCT_BLOCKS(avx,avx)
+#endif
+#endif
 
 void ff_mpadsp_init_mmx(MPADSPContext *s)
 {
@@ -244,8 +250,11 @@ void ff_mpadsp_init_mmx(MPADSPContext *s)
     }
 #endif /* HAVE_INLINE_ASM */
 #if HAVE_YASM
-    if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
+    if (0) {
+#if HAVE_AVX
+    } else if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
         s->imdct36_blocks_float = imdct36_blocks_avx;
+#endif
 #if HAVE_SSE
     } else if (mm_flags & AV_CPU_FLAG_SSSE3) {
         s->imdct36_blocks_float = imdct36_blocks_ssse3;
diff --git a/libavcodec/x86/mpegvideo_mmx.c b/libavcodec/x86/mpegvideo_mmx.c
index 85f6866342..44d4cd3a8a 100644
--- a/libavcodec/x86/mpegvideo_mmx.c
+++ b/libavcodec/x86/mpegvideo_mmx.c
@@ -5,20 +5,20 @@
  * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
  * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/mpegvideo_mmx_template.c b/libavcodec/x86/mpegvideo_mmx_template.c
index 53e09bdbfe..82e4ffa558 100644
--- a/libavcodec/x86/mpegvideo_mmx_template.c
+++ b/libavcodec/x86/mpegvideo_mmx_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -110,10 +110,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 
     if (s->mb_intra) {
         int dummy;
-        if (n < 4)
+        if (n < 4){
             q = s->y_dc_scale;
-        else
+            bias = s->q_intra_matrix16[qscale][1];
+            qmat = s->q_intra_matrix16[qscale][0];
+        }else{
             q = s->c_dc_scale;
+            bias = s->q_chroma_intra_matrix16[qscale][1];
+            qmat = s->q_chroma_intra_matrix16[qscale][0];
+        }
         /* note: block[0] is assumed to be positive */
         if (!s->h263_aic) {
         __asm__ volatile (
@@ -128,8 +133,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0]=0; //avoid fake overflow
 //        temp_block[0] = (block[0] + (q >> 1)) / q;
         last_non_zero_p1 = 1;
-        bias = s->q_intra_matrix16[qscale][1];
-        qmat = s->q_intra_matrix16[qscale][0];
     } else {
         last_non_zero_p1 = 0;
         bias = s->q_inter_matrix16[qscale][1];
diff --git a/libavcodec/x86/pngdsp-init.c b/libavcodec/x86/pngdsp-init.c
index aa21847db2..7a12730620 100644
--- a/libavcodec/x86/pngdsp-init.c
+++ b/libavcodec/x86/pngdsp-init.c
@@ -2,20 +2,20 @@
  * x86 PNG optimizations.
  * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index d6e6374c89..8999c17c82 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -28,7 +28,7 @@ SECTION_RODATA
 
 cextern pw_255
 
-section .text align=16
+SECTION_TEXT 16
 
 ; %1 = nr. of xmm registers used
 %macro ADD_BYTES_FN 1
diff --git a/libavcodec/x86/proresdsp-init.c b/libavcodec/x86/proresdsp-init.c
index f202f9f0cf..c4aeb7f503 100644
--- a/libavcodec/x86/proresdsp-init.c
+++ b/libavcodec/x86/proresdsp-init.c
@@ -29,11 +29,14 @@ void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
 void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
                                 DCTELEM *block, const int16_t *qmat);
 
-void ff_proresdsp_x86_init(ProresDSPContext *dsp)
+void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx)
 {
 #if ARCH_X86_64 && HAVE_YASM
     int flags = av_get_cpu_flags();
 
+    if(avctx->flags & CODEC_FLAG_BITEXACT)
+        return;
+
     if (flags & AV_CPU_FLAG_SSE2) {
         dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
         dsp->idct_put = ff_prores_idct_put_10_sse2;
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index bce36ac1fc..d626fdce08 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -1,23 +1,24 @@
 ;******************************************************************************
 ;* x86-SIMD-optimized IDCT for prores
-;* this is identical to "simple" IDCT except for the clip range
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
 ;*
 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -48,10 +49,10 @@ w1_plus_w5: times 4 dw W1sh2, +W5sh2
 w5_min_w1:  times 4 dw W5sh2, -W1sh2
 w5_plus_w7: times 4 dw W5sh2, +W7sh2
 w7_min_w5:  times 4 dw W7sh2, -W5sh2
-row_round:  times 8 dw (1<<14)
+pw_88:      times 8 dw 0x2008
 
+cextern pw_1
 cextern pw_4
-cextern pw_8
 cextern pw_512
 cextern pw_1019
 
@@ -92,14 +93,12 @@ section .text align=16
     ; a2 -= W6 * row[2];
     ; a3 -= W2 * row[2];
 %ifidn %1, col
-    paddw       m10,[pw_8]
+    paddw       m10,[pw_88]
 %endif
-    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
 %ifidn %1, row
-    psubw       m10,[row_round]
+    paddw       m10,[pw_1]
 %endif
-    SIGNEXTEND  m8,  m9,  m14      ; { row[2] }[0-3] / [4-7]
-    SIGNEXTEND  m10, m11, m14      ; { row[0] }[0-3] / [4-7]
+    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
     pmaddwd     m2,  m0, [w4_plus_w6]
     pmaddwd     m3,  m1, [w4_plus_w6]
     pmaddwd     m4,  m0, [w4_min_w6]
@@ -108,75 +107,33 @@ section .text align=16
     pmaddwd     m7,  m1, [w4_min_w2]
     pmaddwd     m0, [w4_plus_w2]
     pmaddwd     m1, [w4_plus_w2]
-    pslld       m2,  2
-    pslld       m3,  2
-    pslld       m4,  2
-    pslld       m5,  2
-    pslld       m6,  2
-    pslld       m7,  2
-    pslld       m0,  2
-    pslld       m1,  2
 
     ; a0: -1*row[0]-1*row[2]
     ; a1: -1*row[0]
     ; a2: -1*row[0]
     ; a3: -1*row[0]+1*row[2]
-    psubd       m2,  m10           ; a1[0-3]
-    psubd       m3,  m11           ; a1[4-7]
-    psubd       m4,  m10           ; a2[0-3]
-    psubd       m5,  m11           ; a2[4-7]
-    psubd       m0,  m10
-    psubd       m1,  m11
-    psubd       m6,  m10
-    psubd       m7,  m11
-    psubd       m0,  m8            ; a0[0-3]
-    psubd       m1,  m9            ; a0[4-7]
-    paddd       m6,  m8            ; a3[0-3]
-    paddd       m7,  m9            ; a3[4-7]
 
     ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
     ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
     ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
     ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
     SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
-    SIGNEXTEND  m13, m14, m10      ; { row[4] }[0-3] / [4-7]
     pmaddwd     m10, m8, [w4_plus_w6]
     pmaddwd     m11, m9, [w4_plus_w6]
-    pslld       m10, 2
-    pslld       m11, 2
-    psubd       m10,  m13
-    psubd       m11,  m14
     paddd       m0,  m10            ; a0[0-3]
     paddd       m1,  m11            ; a0[4-7]
     pmaddwd     m10, m8, [w4_min_w6]
     pmaddwd     m11, m9, [w4_min_w6]
-    pslld       m10, 2
-    pslld       m11, 2
-    psubd       m10, m13
-    psubd       m11, m14
     paddd       m6,  m10           ; a3[0-3]
     paddd       m7,  m11           ; a3[4-7]
     pmaddwd     m10, m8, [w4_min_w2]
     pmaddwd     m11, m9, [w4_min_w2]
     pmaddwd     m8, [w4_plus_w2]
     pmaddwd     m9, [w4_plus_w2]
-    pslld       m10, 2
-    pslld       m11, 2
-    pslld       m8,  2
-    pslld       m9,  2
-    psubd       m10, m13
-    psubd       m11, m14
-    psubd       m8,  m13
-    psubd       m9,  m14
     psubd       m4,  m10           ; a2[0-3] intermediate
     psubd       m5,  m11           ; a2[4-7] intermediate
     psubd       m2,  m8            ; a1[0-3] intermediate
     psubd       m3,  m9            ; a1[4-7] intermediate
-    SIGNEXTEND  m12, m13, m10      ; { row[6] }[0-3] / [4-7]
-    psubd       m4,  m12           ; a2[0-3]
-    psubd       m5,  m13           ; a2[4-7]
-    paddd       m2,  m12           ; a1[0-3]
-    paddd       m3,  m13           ; a1[4-7]
 
     ; load/store
     mova   [r2+  0], m0
@@ -207,8 +164,6 @@ section .text align=16
     ; b3 = MUL(W7, row[1]);
     ; MAC(b3, -W5, row[3]);
     SBUTTERFLY3 wd,  0,  1, 10, 8  ; { row[1], row[3] }[0-3]/[4-7]
-    SIGNEXTEND  m10, m11, m12      ; { row[1] }[0-3] / [4-7]
-    SIGNEXTEND  m8,  m9,  m12      ; { row[3] }[0-3] / [4-7]
     pmaddwd     m2,  m0, [w3_min_w7]
     pmaddwd     m3,  m1, [w3_min_w7]
     pmaddwd     m4,  m0, [w5_min_w1]
@@ -217,35 +172,11 @@ section .text align=16
     pmaddwd     m7,  m1, [w7_min_w5]
     pmaddwd     m0, [w1_plus_w3]
     pmaddwd     m1, [w1_plus_w3]
-    pslld       m2,  2
-    pslld       m3,  2
-    pslld       m4,  2
-    pslld       m5,  2
-    pslld       m6,  2
-    pslld       m7,  2
-    pslld       m0,  2
-    pslld       m1,  2
 
     ; b0: +1*row[1]+2*row[3]
     ; b1: +2*row[1]-1*row[3]
     ; b2: -1*row[1]-1*row[3]
     ; b3: +1*row[1]+1*row[3]
-    psubd       m2,  m8
-    psubd       m3,  m9
-    paddd       m0,  m8
-    paddd       m1,  m9
-    paddd       m8,  m10           ; { row[1] + row[3] }[0-3]
-    paddd       m9,  m11           ; { row[1] + row[3] }[4-7]
-    paddd       m10, m10
-    paddd       m11, m11
-    paddd       m0,  m8            ; b0[0-3]
-    paddd       m1,  m9            ; b0[4-7]
-    paddd       m2,  m10           ; b1[0-3]
-    paddd       m3,  m11           ; b2[4-7]
-    psubd       m4,  m8            ; b2[0-3]
-    psubd       m5,  m9            ; b2[4-7]
-    paddd       m6,  m8            ; b3[0-3]
-    paddd       m7,  m9            ; b3[4-7]
 
     ; MAC(b0,  W5, row[5]);
     ; MAC(b0,  W7, row[7]);
@@ -256,38 +187,16 @@ section .text align=16
     ; MAC(b3,  W3, row[5]);
     ; MAC(b3, -W1, row[7]);
     SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
-    SIGNEXTEND  m13, m12, m11      ; { row[5] }[0-3] / [4-7]
-    SIGNEXTEND  m14, m11, m10      ; { row[7] }[0-3] / [4-7]
 
     ; b0: -1*row[5]+1*row[7]
     ; b1: -1*row[5]+1*row[7]
     ; b2: +1*row[5]+2*row[7]
     ; b3: +2*row[5]-1*row[7]
-    paddd       m4,  m13
-    paddd       m5,  m12
-    paddd       m6,  m13
-    paddd       m7,  m12
-    psubd       m13, m14           ; { row[5] - row[7] }[0-3]
-    psubd       m12, m11           ; { row[5] - row[7] }[4-7]
-    paddd       m14, m14
-    paddd       m11, m11
-    psubd       m0,  m13
-    psubd       m1,  m12
-    psubd       m2,  m13
-    psubd       m3,  m12
-    paddd       m4,  m14
-    paddd       m5,  m11
-    paddd       m6,  m13
-    paddd       m7,  m12
 
     pmaddwd     m10, m8, [w1_plus_w5]
     pmaddwd     m11, m9, [w1_plus_w5]
     pmaddwd     m12, m8, [w5_plus_w7]
     pmaddwd     m13, m9, [w5_plus_w7]
-    pslld       m10, 2
-    pslld       m11, 2
-    pslld       m12,  2
-    pslld       m13,  2
     psubd       m2,  m10           ; b1[0-3]
     psubd       m3,  m11           ; b1[4-7]
     paddd       m0,  m12            ; b0[0-3]
@@ -296,10 +205,6 @@ section .text align=16
     pmaddwd     m13, m9, [w7_plus_w3]
     pmaddwd     m8, [w3_min_w1]
     pmaddwd     m9, [w3_min_w1]
-    pslld       m12, 2
-    pslld       m13, 2
-    pslld       m8,  2
-    pslld       m9,  2
     paddd       m4,  m12           ; b2[0-3]
     paddd       m5,  m13           ; b2[4-7]
     paddd       m6,  m8            ; b3[0-3]
@@ -346,7 +251,7 @@ cglobal prores_idct_put_10, 4, 4, %1
     pmullw      m13,[r3+64]
     pmullw      m12,[r3+96]
 
-    IDCT_1D     row, 17
+    IDCT_1D     row, 15
 
     ; transpose for second part of IDCT
     TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
@@ -361,20 +266,11 @@ cglobal prores_idct_put_10, 4, 4, %1
 
     ; for (i = 0; i < 8; i++)
     ;     idctSparseColAdd(dest + i, line_size, block + i);
-    IDCT_1D     col, 20
+    IDCT_1D     col, 18
 
     ; clip/store
-    mova        m6, [pw_512]
     mova        m3, [pw_4]
     mova        m5, [pw_1019]
-    paddw       m8,  m6
-    paddw       m0,  m6
-    paddw       m1,  m6
-    paddw       m2,  m6
-    paddw       m4,  m6
-    paddw       m11, m6
-    paddw       m9,  m6
-    paddw       m10, m6
     pmaxsw      m8,  m3
     pmaxsw      m0,  m3
     pmaxsw      m1,  m3
@@ -423,7 +319,9 @@ INIT_XMM sse2
 idct_put_fn 16
 INIT_XMM sse4
 idct_put_fn 16
+%if HAVE_AVX
 INIT_XMM avx
 idct_put_fn 16
+%endif
 
 %endif
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index c43b77abd2..78d8c92b0b 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -49,7 +49,7 @@ SECTION .text
 cglobal rv34_idct_%1, 1, 2, 0
     movsx   r1, word [r0]
     IDCT_DC r1
-    movd    m0, r1
+    movd    m0, r1d
     pshufw  m0, m0, 0
     movq    [r0+ 0], m0
     movq    [r0+ 8], m0
@@ -70,7 +70,7 @@ cglobal rv34_idct_dc_add, 3, 3
     ; calculate DC
     IDCT_DC_ROUND r2
     pxor       m1, m1
-    movd       m0, r2
+    movd       m0, r2d
     psubw      m1, m0
     packuswb   m0, m0
     packuswb   m1, m1
@@ -175,7 +175,7 @@ cglobal rv34_idct_dc_add, 3, 3, 6
     pxor       m1, m1
 
     ; calculate DC
-    movd       m0, r2
+    movd       m0, r2d
     lea        r2, [r0+r1*2]
     movd       m2, [r0]
     movd       m3, [r0+r1]
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index ae740c213a..70c0c0400f 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -466,8 +466,8 @@ cglobal rv40_weight_func_%1_%2, 6, 7, 8
     add        r2, r6
     neg        r6
 
-    movd       m2, r3
-    movd       m3, r4
+    movd       m2, r3d
+    movd       m3, r4d
 %ifidn %1,rnd
 %define  RND   0
     SPLATW     m2, m2
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index bee33657b5..c508ac9328 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -56,6 +56,8 @@ DECLARE_WEIGHT(mmx2)
 DECLARE_WEIGHT(sse2)
 DECLARE_WEIGHT(ssse3)
 
+#if HAVE_YASM
+
 /** @{ */
 /**
  * Define one qpel function.
@@ -182,6 +184,8 @@ QPEL_FUNCS_SET (OP, 3, 1, OPT) \
 QPEL_FUNCS_SET (OP, 3, 2, OPT)
 /** @} */
 
+#endif //HAVE_YASM
+
 void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
 {
 #if HAVE_YASM
diff --git a/libavcodec/x86/simple_idct_mmx.c b/libavcodec/x86/simple_idct_mmx.c
index f455eb8974..20e51a47f4 100644
--- a/libavcodec/x86/simple_idct_mmx.c
+++ b/libavcodec/x86/simple_idct_mmx.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "libavcodec/dsputil.h"
diff --git a/libavcodec/x86/snowdsp_mmx.c b/libavcodec/x86/snowdsp_mmx.c
index 770cc1cc73..5d47206e81 100644
--- a/libavcodec/x86/snowdsp_mmx.c
+++ b/libavcodec/x86/snowdsp_mmx.c
@@ -2,20 +2,20 @@
  * MMX and SSE2 optimized snow DSP utils
  * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -675,14 +675,14 @@ static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM
 
 #define snow_inner_add_yblock_sse2_end_8\
              "sal $1, %%"REG_c"              \n\t"\
-             "addl $"PTR_SIZE"*2, %1         \n\t"\
+             "add"OPSIZE" $"PTR_SIZE"*2, %1  \n\t"\
              snow_inner_add_yblock_sse2_end_common1\
              "sar $1, %%"REG_c"              \n\t"\
              "sub $2, %2                     \n\t"\
              snow_inner_add_yblock_sse2_end_common2
 
 #define snow_inner_add_yblock_sse2_end_16\
-             "addl $"PTR_SIZE"*1, %1         \n\t"\
+             "add"OPSIZE" $"PTR_SIZE"*1, %1  \n\t"\
              snow_inner_add_yblock_sse2_end_common1\
              "dec %2                         \n\t"\
              snow_inner_add_yblock_sse2_end_common2
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000000..c961178e51
--- /dev/null
+++ b/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void v210_x86_init(V210DecContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_YASM
+    if (s->aligned_input) {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+        if (HAVE_AVX && cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+    }
+    else {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+        if (HAVE_AVX && cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+    }
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000000..f39419e2b2
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,89 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff
+v210_mult: dw 64,4,64,4,64,4,64,4
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 2
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+cglobal v210_planar_unpack_%1_%2, 5, 5
+    movsxdifnidn r4, r4d
+    lea    r1, [r1+2*r4]
+    add    r2, r4
+    add    r3, r4
+    neg    r4
+
+    mova   m3, [v210_mult]
+    mova   m4, [v210_mask]
+    mova   m5, [v210_luma_shuf]
+    mova   m6, [v210_chroma_shuf]
+.loop
+%ifidn %1, unaligned
+    movu   m0, [r0]
+%else
+    mova   m0, [r0]
+%endif
+
+    pmullw m1, m0, m3
+    psrld  m0, 10
+    psrlw  m1, 6  ; u0 v0 y1 y2 v1 u2 y4 y5
+    pand   m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+    shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+    pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+    movu   [r1+2*r4], m2
+
+    shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+    pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+    movq   [r2+r4], m1
+    movhps [r3+r4], m1
+
+    add r0, mmsize
+    add r4, 6
+    jl  .loop
+
+    REP_RET
+%endmacro
+
+INIT_XMM
+v210_planar_unpack unaligned, ssse3
+%if HAVE_AVX
+INIT_AVX
+v210_planar_unpack unaligned, avx
+%endif
+
+INIT_XMM
+v210_planar_unpack aligned, ssse3
+%if HAVE_AVX
+INIT_AVX
+v210_planar_unpack aligned, avx
+%endif
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index aae08c2364..5922a6dbf7 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -755,6 +755,9 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
         dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
         dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
         dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
+
+        if (HAVE_YASM)
+            dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd;
     }
 
     if (mm_flags & AV_CPU_FLAG_MMXEXT) {
@@ -782,6 +785,16 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
         dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2;
         dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2;
         dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2;
+
+        if (HAVE_YASM)
+            dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd;
+    } else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_3DNOW) {
+        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd;
+    }
+
+    if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSSE3) {
+        dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd;
+        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd;
     }
 #endif /* HAVE_INLINE_ASM */
 
@@ -795,14 +808,10 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 
 #if HAVE_YASM
     if (mm_flags & AV_CPU_FLAG_MMX) {
-        dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd;
     }
 
     if (mm_flags & AV_CPU_FLAG_MMXEXT) {
         ASSIGN_LF(mmx2);
-        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd;
-    } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
-        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd;
     }
 
     if (mm_flags & AV_CPU_FLAG_SSE2) {
@@ -813,8 +822,6 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
     }
     if (mm_flags & AV_CPU_FLAG_SSSE3) {
         ASSIGN_LF(ssse3);
-        dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd;
-        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd;
     }
     if (mm_flags & AV_CPU_FLAG_SSE4) {
         dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse4;
diff --git a/libavcodec/x86/vc1dsp_yasm.asm b/libavcodec/x86/vc1dsp_yasm.asm
index ced2b5ba88..590aa509a7 100644
--- a/libavcodec/x86/vc1dsp_yasm.asm
+++ b/libavcodec/x86/vc1dsp_yasm.asm
@@ -2,25 +2,25 @@
 ;* VC1 deblocking optimizations
 ;* Copyright (c) 2009 David Conrad
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 cextern pw_4
 cextern pw_5
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 7a88892c11..dcd55f5f8b 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -2,25 +2,25 @@
 ;* MMX/SSE2-optimized functions for the VP3 decoder
 ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 ; MMX-optimized functions cribbed from the original VP3 source code.
 
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 45af041a6e..8905452dcf 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h
index be2dd30b8d..ddbf38b1a9 100644
--- a/libavcodec/x86/vp56_arith.h
+++ b/libavcodec/x86/vp56_arith.h
@@ -4,20 +4,20 @@
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  * Copyright (C) 2010  Eli Friedman
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp56dsp.asm b/libavcodec/x86/vp56dsp.asm
index ca4d97ec15..21ff041d61 100644
--- a/libavcodec/x86/vp56dsp.asm
+++ b/libavcodec/x86/vp56dsp.asm
@@ -3,25 +3,25 @@
 ;* Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
 ;* Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 cextern pw_64
 
diff --git a/libavcodec/x86/vp56dsp_init.c b/libavcodec/x86/vp56dsp_init.c
index ae04440611..9108aa446d 100644
--- a/libavcodec/x86/vp56dsp_init.c
+++ b/libavcodec/x86/vp56dsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
  * Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 64dd8ceadf..8c17fa0382 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 531b205b7b..75c0b56f94 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -3,25 +3,25 @@
 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA