Diffstat (limited to 'libavutil/x86')
-rw-r--r--  libavutil/x86/Makefile           |  10
-rw-r--r--  libavutil/x86/asm.h              |  56
-rw-r--r--  libavutil/x86/bswap.h            |  44
-rw-r--r--  libavutil/x86/cpu.c              |  14
-rw-r--r--  libavutil/x86/cpu.h              |  11
-rw-r--r--  libavutil/x86/cpuid.asm          |   8
-rw-r--r--  libavutil/x86/emms.asm           |   8
-rw-r--r--  libavutil/x86/emms.h             |  12
-rw-r--r--  libavutil/x86/fixed_dsp.asm      |  48
-rw-r--r--  libavutil/x86/fixed_dsp_init.c   |  35
-rw-r--r--  libavutil/x86/float_dsp.asm      | 167
-rw-r--r--  libavutil/x86/float_dsp_init.c   |  97
-rw-r--r--  libavutil/x86/intmath.h          | 136
-rw-r--r--  libavutil/x86/intreadwrite.h     |   8
-rw-r--r--  libavutil/x86/lls.asm            |  66
-rw-r--r--  libavutil/x86/lls_init.c         |  18
-rw-r--r--  libavutil/x86/pixelutils.asm     | 165
-rw-r--r--  libavutil/x86/pixelutils.h       |  26
-rw-r--r--  libavutil/x86/pixelutils_init.c  |  64
-rw-r--r--  libavutil/x86/timer.h            |   9
-rw-r--r--  libavutil/x86/w64xmmtest.h       |  13
-rw-r--r--  libavutil/x86/x86inc.asm         | 273
-rw-r--r--  libavutil/x86/x86util.asm        | 132
23 files changed, 1104 insertions(+), 316 deletions(-)
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index 1e19082233..94d8832062 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -1,8 +1,16 @@
OBJS += x86/cpu.o \
+ x86/fixed_dsp_init.o \
x86/float_dsp_init.o \
x86/lls_init.o \
+OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \
+
+EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o
+
YASM-OBJS += x86/cpuid.o \
- x86/emms.o \
+ $(EMMS_OBJS__yes_) \
+ x86/fixed_dsp.o \
x86/float_dsp.o \
x86/lls.o \
+
+YASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o \
diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h
index e30f5dbaf7..109b65e542 100644
--- a/libavutil/x86/asm.h
+++ b/libavutil/x86/asm.h
@@ -1,20 +1,20 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,6 +25,7 @@
#include "config.h"
typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
+typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
#if ARCH_X86_64
# define OPSIZE "q"
@@ -37,7 +38,8 @@ typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
# define PTR_SIZE "8"
typedef int64_t x86_reg;
-# define REG_SP "rsp"
+/* REG_SP is defined in Solaris sys headers, so use REG_sp */
+# define REG_sp "rsp"
# define REG_BP "rbp"
# define REGBP rbp
# define REGa rax
@@ -58,7 +60,7 @@ typedef int64_t x86_reg;
# define PTR_SIZE "4"
typedef int32_t x86_reg;
-# define REG_SP "esp"
+# define REG_sp "esp"
# define REG_BP "ebp"
# define REGBP ebp
# define REGa eax
@@ -107,6 +109,46 @@ typedef int x86_reg;
# define LOCAL_MANGLE(a) #a
#endif
-#define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
+#if HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS
+# define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
+# define NAMED_CONSTRAINTS_ADD(...)
+# define NAMED_CONSTRAINTS(...)
+# define NAMED_CONSTRAINTS_ARRAY_ADD(...)
+# define NAMED_CONSTRAINTS_ARRAY(...)
+#else
+ /* When direct symbol references are used in code passed to a compiler that does not support them
+ * then these references need to be converted to named asm constraints instead.
+ * Instead of returning a direct symbol MANGLE now returns a named constraint for that specific symbol.
+ * In order for this to work there must also be a corresponding entry in the asm-interface. To add this
+ * entry use the macro NAMED_CONSTRAINTS() and pass in a list of each symbol reference used in the
+ * corresponding block of code. (e.g. NAMED_CONSTRAINTS(var1,var2,var3) where var1 is the first symbol etc. ).
+ * If there are already existing constraints then use NAMED_CONSTRAINTS_ADD to add to the existing constraint list.
+ */
+# define MANGLE(a) "%["#a"]"
+ // Intel/MSVC does not correctly expand va-args so we need a rather ugly hack in order to get it to work
+# define FE_0(P,X) P(X)
+# define FE_1(P,X,X1) P(X), FE_0(P,X1)
+# define FE_2(P,X,X1,X2) P(X), FE_1(P,X1,X2)
+# define FE_3(P,X,X1,X2,X3) P(X), FE_2(P,X1,X2,X3)
+# define FE_4(P,X,X1,X2,X3,X4) P(X), FE_3(P,X1,X2,X3,X4)
+# define FE_5(P,X,X1,X2,X3,X4,X5) P(X), FE_4(P,X1,X2,X3,X4,X5)
+# define FE_6(P,X,X1,X2,X3,X4,X5,X6) P(X), FE_5(P,X1,X2,X3,X4,X5,X6)
+# define FE_7(P,X,X1,X2,X3,X4,X5,X6,X7) P(X), FE_6(P,X1,X2,X3,X4,X5,X6,X7)
+# define FE_8(P,X,X1,X2,X3,X4,X5,X6,X7,X8) P(X), FE_7(P,X1,X2,X3,X4,X5,X6,X7,X8)
+# define FE_9(P,X,X1,X2,X3,X4,X5,X6,X7,X8,X9) P(X), FE_8(P,X1,X2,X3,X4,X5,X6,X7,X8,X9)
+# define GET_FE_IMPL(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME
+# define GET_FE(A) GET_FE_IMPL A
+# define GET_FE_GLUE(x, y) x y
+# define FOR_EACH_VA(P,...) GET_FE_GLUE(GET_FE((__VA_ARGS__,FE_9,FE_8,FE_7,FE_6,FE_5,FE_4,FE_3,FE_2,FE_1,FE_0)), (P,__VA_ARGS__))
+# define NAME_CONSTRAINT(x) [x] "m"(x)
+ // Parameters are a list of each symbol reference required
+# define NAMED_CONSTRAINTS_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
+ // Same but without comma for when there are no previously defined constraints
+# define NAMED_CONSTRAINTS(...) FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
+ // Same as above NAMED_CONSTRAINTS except used for passing arrays/pointers instead of normal variables
+# define NAME_CONSTRAINT_ARRAY(x) [x] "m"(*x)
+# define NAMED_CONSTRAINTS_ARRAY_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
+# define NAMED_CONSTRAINTS_ARRAY(...) FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
+#endif
#endif /* AVUTIL_X86_ASM_H */
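
For readers unfamiliar with the constraint machinery introduced above, here is a
minimal usage sketch. The table name my_table is a placeholder, not a symbol added
by this patch: MANGLE() either emits the mangled symbol directly or, on compilers
without direct symbol references, the named constraint "%[my_table]" that the
NAMED_CONSTRAINTS_ARRAY() entry in the input operand list then supplies.

    #include <stdint.h>
    #include "libavutil/x86/asm.h"   /* MANGLE(), NAMED_CONSTRAINTS_ARRAY() */

    static const uint8_t my_table[16] = { 42 };   /* placeholder constant table */

    static inline unsigned load_first_entry(void)
    {
        unsigned v;
        /* Direct-symbol mode assembles "movzbl my_table, %0"; otherwise the
         * operand becomes "%[my_table]", bound by the named "m" constraint. */
        __asm__ ("movzbl "MANGLE(my_table)", %0"
                 : "=r"(v)
                 : NAMED_CONSTRAINTS_ARRAY(my_table));
        return v;
    }
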
diff --git a/libavutil/x86/bswap.h b/libavutil/x86/bswap.h
index c73be9af81..ffa59e4c82 100644
--- a/libavutil/x86/bswap.h
+++ b/libavutil/x86/bswap.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,21 +25,47 @@
#define AVUTIL_X86_BSWAP_H
#include <stdint.h>
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
#include "config.h"
#include "libavutil/attributes.h"
-#if HAVE_INLINE_ASM
+#if defined(_MSC_VER)
+
+#define av_bswap16 av_bswap16
+static av_always_inline av_const uint16_t av_bswap16(uint16_t x)
+{
+ return _rotr16(x, 8);
+}
+
+#define av_bswap32 av_bswap32
+static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
+{
+ return _byteswap_ulong(x);
+}
+
+#if ARCH_X86_64
+#define av_bswap64 av_bswap64
+static inline uint64_t av_const av_bswap64(uint64_t x)
+{
+ return _byteswap_uint64(x);
+}
+#endif
+
+
+#elif HAVE_INLINE_ASM
-#if !AV_GCC_VERSION_AT_LEAST(4,1)
+#if AV_GCC_VERSION_AT_MOST(4,0)
#define av_bswap16 av_bswap16
static av_always_inline av_const unsigned av_bswap16(unsigned x)
{
__asm__("rorw $8, %w0" : "+r"(x));
return x;
}
-#endif /* !AV_GCC_VERSION_AT_LEAST(4,1) */
+#endif /* AV_GCC_VERSION_AT_MOST(4,0) */
-#if !AV_GCC_VERSION_AT_LEAST(4,5)
+#if AV_GCC_VERSION_AT_MOST(4,4) || defined(__INTEL_COMPILER)
#define av_bswap32 av_bswap32
static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
{
@@ -55,7 +81,7 @@ static inline uint64_t av_const av_bswap64(uint64_t x)
return x;
}
#endif
-#endif /* !AV_GCC_VERSION_AT_LEAST(4,5) */
+#endif /* AV_GCC_VERSION_AT_MOST(4,4) */
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_X86_BSWAP_H */
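
The MSVC branch above maps the byte swaps onto compiler intrinsics (_rotr16,
_byteswap_ulong, _byteswap_uint64) instead of inline asm. For reference, every
path must produce the usual byte reversal; a portable scalar sketch of the
32-bit case:

    #include <stdint.h>

    static uint32_t bswap32_ref(uint32_t x)   /* reference only, not the header's code */
    {
        return (x >> 24) | ((x >> 8) & 0x0000ff00u) |
               ((x << 8) & 0x00ff0000u) | (x << 24);
    }
    /* bswap32_ref(0x11223344) == 0x44332211, the value _byteswap_ulong() returns. */
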
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index 098ccf7004..f57d72d0a7 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -3,20 +3,20 @@
* (c)1997-99 by H. Dietz and R. Fisher
* Converted to C and improved by Fabrice Bellard.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -45,7 +45,7 @@
"cpuid \n\t" \
"xchg %%"REG_b", %%"REG_S \
: "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) \
- : "0" (index))
+ : "0" (index), "2"(0))
#define xgetbv(index, eax, edx) \
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index))
@@ -126,6 +126,8 @@ int ff_get_cpu_flags_x86(void)
rval |= AV_CPU_FLAG_SSE4;
if (ecx & 0x00100000 )
rval |= AV_CPU_FLAG_SSE42;
+ if (ecx & 0x01000000 )
+ rval |= AV_CPU_FLAG_AESNI;
#if HAVE_AVX
/* Check OXSAVE and AVX bits */
if ((ecx & 0x18000000) == 0x18000000) {
@@ -143,7 +145,7 @@ int ff_get_cpu_flags_x86(void)
if (max_std_level >= 7) {
cpuid(7, eax, ebx, ecx, edx);
#if HAVE_AVX2
- if (ebx & 0x00000020)
+ if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
rval |= AV_CPU_FLAG_AVX2;
#endif /* HAVE_AVX2 */
/* BMI1/2 don't need OS support */
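
Two behavioural fixes in the cpu.c hunks above: the new "2"(0) input clears ECX
before CPUID, because leaf 7 is subleaf-indexed and its EBX feature bits (AVX2,
BMI1/2) are only defined for subleaf 0; and the AVX2 flag is now only set when
the AVX/OS-XSAVE check has already passed, since AVX2 code is unusable if the
OS does not save the YMM state. For comparison, the same leaf-7 query expressed
with GCC's cpuid.h (an illustration, not the code used here):

    #include <cpuid.h>

    static int cpu_reports_avx2(void)
    {
        unsigned eax, ebx, ecx, edx;
        __cpuid_count(7, 0, eax, ebx, ecx, edx);  /* leaf 7, subleaf 0: ECX forced to 0 */
        return (ebx >> 5) & 1;                    /* EBX bit 5 == 0x00000020 (AVX2) */
    }
    /* A set bit alone is not enough: OS support for YMM state is still required,
     * which is exactly what the added rval & AV_CPU_FLAG_AVX guard enforces. */
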
diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h
index 0695436548..dc102c6015 100644
--- a/libavutil/x86/cpu.h
+++ b/libavutil/x86/cpu.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -47,6 +47,7 @@
#define X86_FMA3(flags) CPUEXT(flags, FMA3)
#define X86_FMA4(flags) CPUEXT(flags, FMA4)
#define X86_AVX2(flags) CPUEXT(flags, AVX2)
+#define X86_AESNI(flags) CPUEXT(flags, AESNI)
#define EXTERNAL_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOW)
#define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOWEXT)
@@ -69,6 +70,7 @@
#define EXTERNAL_FMA3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA3)
#define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4)
#define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2)
+#define EXTERNAL_AESNI(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
#define INLINE_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW)
#define INLINE_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT)
@@ -91,6 +93,7 @@
#define INLINE_FMA3(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA3)
#define INLINE_FMA4(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA4)
#define INLINE_AVX2(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX2)
+#define INLINE_AESNI(flags) CPUEXT_SUFFIX(flags, _INLINE, AESNI)
void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx);
void ff_cpu_xgetbv(int op, int *eax, int *edx);
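
A hedged sketch of how the new AESNI macros are meant to be consumed by a DSP
init function; MyDSPContext and ff_aes_encrypt_aesni are placeholder names, not
symbols introduced by this patch:

    #include <stdint.h>
    #include "libavutil/attributes.h"
    #include "libavutil/cpu.h"
    #include "libavutil/x86/cpu.h"

    typedef struct MyDSPContext {   /* placeholder context */
        void (*aes_encrypt)(uint8_t *dst, const uint8_t *src, int blocks);
    } MyDSPContext;

    void ff_aes_encrypt_aesni(uint8_t *dst, const uint8_t *src, int blocks); /* placeholder */

    av_cold void ff_mydsp_init_x86(MyDSPContext *c)
    {
        int cpu_flags = av_get_cpu_flags();

        if (EXTERNAL_AESNI(cpu_flags))   /* standalone asm that requires AES-NI */
            c->aes_encrypt = ff_aes_encrypt_aesni;
    }
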
diff --git a/libavutil/x86/cpuid.asm b/libavutil/x86/cpuid.asm
index 1cb8e94ea3..c3f7866ec7 100644
--- a/libavutil/x86/cpuid.asm
+++ b/libavutil/x86/cpuid.asm
@@ -4,20 +4,20 @@
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavutil/x86/emms.asm b/libavutil/x86/emms.asm
index a6851acc99..0aad34af3f 100644
--- a/libavutil/x86/emms.asm
+++ b/libavutil/x86/emms.asm
@@ -1,20 +1,20 @@
;*****************************************************************************
;* Copyright (C) 2013 Martin Storsjo
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavutil/x86/emms.h b/libavutil/x86/emms.h
index 2ed9e5d09d..a529b6bbbe 100644
--- a/libavutil/x86/emms.h
+++ b/libavutil/x86/emms.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -21,6 +21,7 @@
#include "config.h"
#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
void avpriv_emms_yasm(void);
@@ -33,7 +34,8 @@ void avpriv_emms_yasm(void);
*/
static av_always_inline void emms_c(void)
{
- __asm__ volatile ("emms" ::: "memory");
+ if(av_get_cpu_flags() & AV_CPU_FLAG_MMX)
+ __asm__ volatile ("emms" ::: "memory");
}
#elif HAVE_MMX && HAVE_MM_EMPTY
# include <mmintrin.h>
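
The emms_c() change above adds a runtime check of the MMX CPU flag, so generic
code can call it unconditionally without executing an illegal instruction on
CPUs without MMX, or when the flag has been masked at runtime. A typical call
pattern, with a placeholder MMX routine and an assumed include path:

    #include <stdint.h>
    #include "libavutil/x86/emms.h"   /* include path assumed for this sketch */

    void blend_rows_mmx(float *dst, const uint8_t *src, int len);   /* placeholder */

    static void process(float *dst, const uint8_t *src, int len)
    {
        blend_rows_mmx(dst, src, len);   /* leaves the FPU in MMX state */
        emms_c();                        /* no-op when AV_CPU_FLAG_MMX is unset */
        dst[0] *= 0.5f;                  /* x87/float math is safe again here */
    }
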
diff --git a/libavutil/x86/fixed_dsp.asm b/libavutil/x86/fixed_dsp.asm
new file mode 100644
index 0000000000..979dd5c334
--- /dev/null
+++ b/libavutil/x86/fixed_dsp.asm
@@ -0,0 +1,48 @@
+;*****************************************************************************
+;* x86-optimized Fixed Point DSP functions
+;*
+;* Copyright 2016 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86util.asm"
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void ff_butterflies_fixed(float *src0, float *src1, int len);
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal butterflies_fixed, 3,3,3, src0, src1, len
+ shl lend, 2
+ add src0q, lenq
+ add src1q, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [src0q + lenq]
+ mova m1, [src1q + lenq]
+ mova m2, m0
+ paddd m0, m1
+ psubd m2, m1
+ mova [src0q + lenq], m0
+ mova [src1q + lenq], m2
+ add lenq, mmsize
+ jl .loop
+ RET
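
The SSE2 loop above handles four 32-bit fixed-point samples per iteration. Its
scalar meaning, using the int * operand types declared in fixed_dsp_init.c
below and assuming len is a multiple of 4 so the 16-byte mova accesses stay in
bounds, is roughly:

    static void butterflies_fixed_ref(int *src0, int *src1, int len)
    {
        int i;
        for (i = 0; i < len; i++) {
            int a = src0[i], b = src1[i];
            src0[i] = a + b;   /* paddd: the SIMD version wraps mod 2^32 */
            src1[i] = a - b;   /* psubd */
        }
    }
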
diff --git a/libavutil/x86/fixed_dsp_init.c b/libavutil/x86/fixed_dsp_init.c
new file mode 100644
index 0000000000..303a2eb922
--- /dev/null
+++ b/libavutil/x86/fixed_dsp_init.c
@@ -0,0 +1,35 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/fixed_dsp.h"
+#include "cpu.h"
+
+void ff_butterflies_fixed_sse2(int *src0, int *src1, int len);
+
+av_cold void ff_fixed_dsp_init_x86(AVFixedDSPContext *fdsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ fdsp->butterflies_fixed = ff_butterflies_fixed_sse2;
+ }
+}
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index d96249978a..021ff03c87 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -1,20 +1,22 @@
;*****************************************************************************
;* x86-optimized Float DSP functions
;*
-;* This file is part of Libav.
+;* Copyright 2006 Loren Merritt
;*
-;* Libav is free software; you can redistribute it and/or
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -48,8 +50,10 @@ ALIGN 16
INIT_XMM sse
VECTOR_FMUL
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
+%endif
;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
@@ -57,33 +61,48 @@ VECTOR_FMUL
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
-cglobal vector_fmac_scalar, 3,3,3, dst, src, len
+cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
-cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
+cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
VBROADCASTSS m0, mulm
%else
%if WIN64
- mova xmm0, xmm2
+ SWAP 0, 2
%endif
- shufps xmm0, xmm0, 0
+ shufps xm0, xm0, 0
%if cpuflag(avx)
- vinsertf128 m0, m0, xmm0, 1
+ vinsertf128 m0, m0, xm0, 1
%endif
%endif
lea lenq, [lend*4-64]
.loop:
-%assign a 0
-%rep 32/mmsize
- mulps m1, m0, [srcq+lenq+(a+0)*mmsize]
- mulps m2, m0, [srcq+lenq+(a+1)*mmsize]
- addps m1, m1, [dstq+lenq+(a+0)*mmsize]
- addps m2, m2, [dstq+lenq+(a+1)*mmsize]
- mova [dstq+lenq+(a+0)*mmsize], m1
- mova [dstq+lenq+(a+1)*mmsize], m2
-%assign a a+2
-%endrep
+%if cpuflag(fma3)
+ mova m1, [dstq+lenq]
+ mova m2, [dstq+lenq+1*mmsize]
+ fmaddps m1, m0, [srcq+lenq], m1
+ fmaddps m2, m0, [srcq+lenq+1*mmsize], m2
+%else ; cpuflag
+ mulps m1, m0, [srcq+lenq]
+ mulps m2, m0, [srcq+lenq+1*mmsize]
+%if mmsize < 32
+ mulps m3, m0, [srcq+lenq+2*mmsize]
+ mulps m4, m0, [srcq+lenq+3*mmsize]
+%endif ; mmsize
+ addps m1, m1, [dstq+lenq]
+ addps m2, m2, [dstq+lenq+1*mmsize]
+%if mmsize < 32
+ addps m3, m3, [dstq+lenq+2*mmsize]
+ addps m4, m4, [dstq+lenq+3*mmsize]
+%endif ; mmsize
+%endif ; cpuflag
+ mova [dstq+lenq], m1
+ mova [dstq+lenq+1*mmsize], m2
+%if mmsize < 32
+ mova [dstq+lenq+2*mmsize], m3
+ mova [dstq+lenq+3*mmsize], m4
+%endif ; mmsize
sub lenq, 64
jge .loop
REP_RET
@@ -91,8 +110,14 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
INIT_XMM sse
VECTOR_FMAC_SCALAR
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMAC_SCALAR
+%endif
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
@@ -141,16 +166,11 @@ cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
VBROADCASTSD m0, mulm
%else
%if WIN64
- movlhps xmm2, xmm2
-%if cpuflag(avx)
- vinsertf128 ymm2, ymm2, xmm2, 1
-%endif
SWAP 0, 2
-%else
- movlhps xmm0, xmm0
-%if cpuflag(avx)
- vinsertf128 ymm0, ymm0, xmm0, 1
%endif
+ movlhps xm0, xm0
+%if cpuflag(avx)
+ vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
lea lenq, [lend*8-2*mmsize]
@@ -172,20 +192,85 @@ VECTOR_DMUL_SCALAR
%endif
;-----------------------------------------------------------------------------
+; vector_fmul_window(float *dst, const float *src0,
+; const float *src1, const float *win, int len);
+;-----------------------------------------------------------------------------
+%macro VECTOR_FMUL_WINDOW 0
+cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
+ shl lend, 2
+ lea len1q, [lenq - mmsize]
+ add src0q, lenq
+ add dstq, lenq
+ add winq, lenq
+ neg lenq
+.loop:
+ mova m0, [winq + lenq]
+ mova m4, [src0q + lenq]
+%if cpuflag(sse)
+ mova m1, [winq + len1q]
+ mova m5, [src1q + len1q]
+ shufps m1, m1, 0x1b
+ shufps m5, m5, 0x1b
+ mova m2, m0
+ mova m3, m1
+ mulps m2, m4
+ mulps m3, m5
+ mulps m1, m4
+ mulps m0, m5
+ addps m2, m3
+ subps m1, m0
+ shufps m2, m2, 0x1b
+%else
+ pswapd m1, [winq + len1q]
+ pswapd m5, [src1q + len1q]
+ mova m2, m0
+ mova m3, m1
+ pfmul m2, m4
+ pfmul m3, m5
+ pfmul m1, m4
+ pfmul m0, m5
+ pfadd m2, m3
+ pfsub m1, m0
+ pswapd m2, m2
+%endif
+ mova [dstq + lenq], m1
+ mova [dstq + len1q], m2
+ sub len1q, mmsize
+ add lenq, mmsize
+ jl .loop
+%if mmsize == 8
+ femms
+%endif
+ REP_RET
+%endmacro
+
+INIT_MMX 3dnowext
+VECTOR_FMUL_WINDOW
+INIT_XMM sse
+VECTOR_FMUL_WINDOW
+
+;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
; const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
-cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
+cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
mova m0, [src0q + lenq]
mova m1, [src0q + lenq + mmsize]
+%if cpuflag(fma3)
+ mova m2, [src2q + lenq]
+ mova m3, [src2q + lenq + mmsize]
+ fmaddps m0, m0, [src1q + lenq], m2
+ fmaddps m1, m1, [src1q + lenq + mmsize], m3
+%else
mulps m0, m0, [src1q + lenq]
mulps m1, m1, [src1q + lenq + mmsize]
addps m0, m0, [src2q + lenq]
addps m1, m1, [src2q + lenq + mmsize]
+%endif
mova [dstq + lenq], m0
mova [dstq + lenq + mmsize], m1
@@ -196,8 +281,14 @@ ALIGN 16
INIT_XMM sse
VECTOR_FMUL_ADD
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMUL_ADD
+%endif
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
@@ -233,16 +324,18 @@ ALIGN 16
INIT_XMM sse
VECTOR_FMUL_REVERSE
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
+%endif
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
+ shl offsetd, 2
+ add v1q, offsetq
+ add v2q, offsetq
neg offsetq
- shl offsetq, 2
- sub v1q, offsetq
- sub v2q, offsetq
xorps xmm0, xmm0
.loop:
movaps xmm1, [v1q+offsetq]
@@ -266,14 +359,9 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
-%if ARCH_X86_64
- movsxd lenq, lend
-%endif
- test lenq, lenq
- jz .end
- shl lenq, 2
- lea src0q, [src0q + lenq]
- lea src1q, [src1q + lenq]
+ shl lend, 2
+ add src0q, lenq
+ add src1q, lenq
neg lenq
.loop:
mova m0, [src0q + lenq]
@@ -284,5 +372,4 @@ cglobal butterflies_float, 3,3,3, src0, src1, len
mova [src0q + lenq], m0
add lenq, mmsize
jl .loop
-.end:
REP_RET
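
The new vector_fmul_window implementations replace the inline-asm versions that
are removed from float_dsp_init.c further down. As a reading aid, the operation
they perform, a symmetric windowed overlap of src0 and src1 that walks the
window from both ends, in scalar form (reconstructed from the asm, so treat it
as a sketch rather than the canonical C version):

    static void vector_fmul_window_ref(float *dst, const float *src0,
                                       const float *src1, const float *win,
                                       int len)
    {
        int i, j;

        dst  += len;
        win  += len;
        src0 += len;

        for (i = -len, j = len - 1; i < 0; i++, j--) {
            float s0 = src0[i];
            float s1 = src1[j];
            float wi = win[i];
            float wj = win[j];

            dst[i] = s0 * wj - s1 * wi;
            dst[j] = s0 * wi + s1 * wj;
        }
    }
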
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index b70433031a..f211f2396b 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -33,6 +33,8 @@ void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
int len);
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
int len);
+void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul,
+ int len);
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
int len);
@@ -42,10 +44,17 @@ void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
void ff_vector_dmul_scalar_avx(double *dst, const double *src,
double mul, int len);
+void ff_vector_fmul_window_3dnowext(float *dst, const float *src0,
+ const float *src1, const float *win, int len);
+void ff_vector_fmul_window_sse(float *dst, const float *src0,
+ const float *src1, const float *win, int len);
+
void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
const float *src2, int len);
+void ff_vector_fmul_add_fma3(float *dst, const float *src0, const float *src1,
+ const float *src2, int len);
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
const float *src1, int len);
@@ -56,88 +65,18 @@ float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
void ff_butterflies_float_sse(float *src0, float *src1, int len);
-#if HAVE_6REGS && HAVE_INLINE_ASM
-static void vector_fmul_window_3dnowext(float *dst, const float *src0,
- const float *src1, const float *win,
- int len)
-{
- x86_reg i = -len * 4;
- x86_reg j = len * 4 - 8;
- __asm__ volatile (
- "1: \n"
- "pswapd (%5, %1), %%mm1 \n"
- "movq (%5, %0), %%mm0 \n"
- "pswapd (%4, %1), %%mm5 \n"
- "movq (%3, %0), %%mm4 \n"
- "movq %%mm0, %%mm2 \n"
- "movq %%mm1, %%mm3 \n"
- "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
- "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
- "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
- "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
- "pfadd %%mm3, %%mm2 \n"
- "pfsub %%mm0, %%mm1 \n"
- "pswapd %%mm2, %%mm2 \n"
- "movq %%mm1, (%2, %0) \n"
- "movq %%mm2, (%2, %1) \n"
- "sub $8, %1 \n"
- "add $8, %0 \n"
- "jl 1b \n"
- "femms \n"
- : "+r"(i), "+r"(j)
- : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
- );
-}
-
-static void vector_fmul_window_sse(float *dst, const float *src0,
- const float *src1, const float *win, int len)
-{
- x86_reg i = -len * 4;
- x86_reg j = len * 4 - 16;
- __asm__ volatile (
- "1: \n"
- "movaps (%5, %1), %%xmm1 \n"
- "movaps (%5, %0), %%xmm0 \n"
- "movaps (%4, %1), %%xmm5 \n"
- "movaps (%3, %0), %%xmm4 \n"
- "shufps $0x1b, %%xmm1, %%xmm1 \n"
- "shufps $0x1b, %%xmm5, %%xmm5 \n"
- "movaps %%xmm0, %%xmm2 \n"
- "movaps %%xmm1, %%xmm3 \n"
- "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
- "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
- "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
- "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
- "addps %%xmm3, %%xmm2 \n"
- "subps %%xmm0, %%xmm1 \n"
- "shufps $0x1b, %%xmm2, %%xmm2 \n"
- "movaps %%xmm1, (%2, %0) \n"
- "movaps %%xmm2, (%2, %1) \n"
- "sub $16, %1 \n"
- "add $16, %0 \n"
- "jl 1b \n"
- : "+r"(i), "+r"(j)
- : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
- );
-}
-#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
-
av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
int cpu_flags = av_get_cpu_flags();
-#if HAVE_6REGS && HAVE_INLINE_ASM
- if (INLINE_AMD3DNOWEXT(cpu_flags)) {
- fdsp->vector_fmul_window = vector_fmul_window_3dnowext;
+ if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
+ fdsp->vector_fmul_window = ff_vector_fmul_window_3dnowext;
}
- if (INLINE_SSE(cpu_flags)) {
- fdsp->vector_fmul_window = vector_fmul_window_sse;
- }
-#endif
if (EXTERNAL_SSE(cpu_flags)) {
fdsp->vector_fmul = ff_vector_fmul_sse;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
+ fdsp->vector_fmul_window = ff_vector_fmul_window_sse;
fdsp->vector_fmul_add = ff_vector_fmul_add_sse;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
@@ -153,4 +92,8 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
}
+ if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
+ fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
+ fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
+ }
}
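
The FMA3 versions are only installed when the CPU is not flagged AVXSLOW, since
both kernels are built as 256-bit ymm code that such parts execute more slowly
than the plain SSE path. For reference, the scalar meaning of the two functions
they provide:

    static void vector_fmac_scalar_ref(float *dst, const float *src, float mul, int len)
    {
        int i;
        for (i = 0; i < len; i++)
            dst[i] += src[i] * mul;   /* the multiply-add each fmaddps fuses */
    }

    static void vector_fmul_add_ref(float *dst, const float *src0, const float *src1,
                                    const float *src2, int len)
    {
        int i;
        for (i = 0; i < len; i++)
            dst[i] = src0[i] * src1[i] + src2[i];
    }
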
diff --git a/libavutil/x86/intmath.h b/libavutil/x86/intmath.h
new file mode 100644
index 0000000000..f58b0d08da
--- /dev/null
+++ b/libavutil/x86/intmath.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2015 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_INTMATH_H
+#define AVUTIL_X86_INTMATH_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#if HAVE_FAST_CLZ
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__INTEL_COMPILER)
+#include <immintrin.h>
+#endif
+#endif
+#include "config.h"
+
+#if HAVE_FAST_CLZ
+#if (defined(__INTEL_COMPILER) && (__INTEL_COMPILER>=1216)) || defined(_MSC_VER)
+# if defined(__INTEL_COMPILER)
+# define ff_log2(x) (_bit_scan_reverse((x)|1))
+# else
+# define ff_log2 ff_log2_x86
+static av_always_inline av_const int ff_log2_x86(unsigned int v)
+{
+ unsigned long n;
+ _BitScanReverse(&n, v|1);
+ return n;
+}
+# endif
+# define ff_log2_16bit av_log2
+
+# define ff_ctz(v) _tzcnt_u32(v)
+
+# if ARCH_X86_64
+# define ff_ctzll(v) _tzcnt_u64(v)
+# else
+# define ff_ctzll ff_ctzll_x86
+static av_always_inline av_const int ff_ctzll_x86(long long v)
+{
+ return ((uint32_t)v == 0) ? _tzcnt_u32((uint32_t)(v >> 32)) + 32 : _tzcnt_u32((uint32_t)v);
+}
+# endif
+
+#endif /* __INTEL_COMPILER */
+
+#endif /* HAVE_FAST_CLZ */
+
+#if defined(__GNUC__)
+
+/* Our generic version of av_popcount is faster than GCC's built-in on
+ * CPUs that don't support the popcnt instruction.
+ */
+#if defined(__POPCNT__)
+ #define av_popcount __builtin_popcount
+#if ARCH_X86_64
+ #define av_popcount64 __builtin_popcountll
+#endif
+
+#endif /* __POPCNT__ */
+
+#if defined(__BMI2__)
+
+#if AV_GCC_VERSION_AT_LEAST(5,1)
+#define av_mod_uintp2 __builtin_ia32_bzhi_si
+#elif HAVE_INLINE_ASM
+/* GCC releases before 5.1.0 have a broken bzhi builtin, so for those we
+ * implement it using inline assembly
+ */
+#define av_mod_uintp2 av_mod_uintp2_bmi2
+static av_always_inline av_const unsigned av_mod_uintp2_bmi2(unsigned a, unsigned p)
+{
+ if (av_builtin_constant_p(p))
+ return a & ((1 << p) - 1);
+ else {
+ unsigned x;
+ __asm__ ("bzhi %2, %1, %0 \n\t" : "=r"(x) : "rm"(a), "r"(p));
+ return x;
+ }
+}
+#endif /* AV_GCC_VERSION_AT_LEAST */
+
+#endif /* __BMI2__ */
+
+#if defined(__SSE2__) && !defined(__INTEL_COMPILER)
+
+#define av_clipd av_clipd_sse2
+static av_always_inline av_const double av_clipd_sse2(double a, double amin, double amax)
+{
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
+ if (amin > amax) abort();
+#endif
+ __asm__ ("minsd %2, %0 \n\t"
+ "maxsd %1, %0 \n\t"
+ : "+&x"(a) : "xm"(amin), "xm"(amax));
+ return a;
+}
+
+#endif /* __SSE2__ */
+
+#if defined(__SSE__) && !defined(__INTEL_COMPILER)
+
+#define av_clipf av_clipf_sse
+static av_always_inline av_const float av_clipf_sse(float a, float amin, float amax)
+{
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
+ if (amin > amax) abort();
+#endif
+ __asm__ ("minss %2, %0 \n\t"
+ "maxss %1, %0 \n\t"
+ : "+&x"(a) : "xm"(amin), "xm"(amax));
+ return a;
+}
+
+#endif /* __SSE__ */
+
+#endif /* __GNUC__ */
+
+#endif /* AVUTIL_X86_INTMATH_H */
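
av_mod_uintp2(a, p) keeps the low p bits of a, i.e. a mod 2^p; the BMI2 bzhi
instruction does that in a single operation, and the inline-asm fallback exists
because the __builtin_ia32_bzhi_si builtin is broken in GCC releases before
5.1. Scalar reference of the same operation (for p < 32, matching the
constant-p fallback above):

    static inline unsigned mod_uintp2_ref(unsigned a, unsigned p)
    {
        return a & (((unsigned)1 << p) - 1);   /* low p bits of a */
    }

The av_clipd/av_clipf overrides at the end behave like the generic versions but
let minsd/maxsd (resp. minss/maxss) do the clamping without branches.
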
diff --git a/libavutil/x86/intreadwrite.h b/libavutil/x86/intreadwrite.h
index 635096e569..4061d19231 100644
--- a/libavutil/x86/intreadwrite.h
+++ b/libavutil/x86/intreadwrite.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
index eab85ed050..317fba6fca 100644
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -3,20 +3,20 @@
;*
;* Copyright (c) 2013 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -125,7 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
.ret:
REP_RET
-INIT_YMM avx
+%macro UPDATE_LLS 0
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
%define covarq ctxq
mov countd, [ctxq + LLSModel.indep_count]
@@ -139,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
vbroadcastsd ymm6, [varq + iq*8 + 16]
vbroadcastsd ymm7, [varq + iq*8 + 24]
vextractf128 xmm3, ymm1, 1
+%if cpuflag(fma3)
+ mova ymm0, COVAR(iq ,0)
+ mova xmm2, COVAR(iq+2,2)
+ fmaddpd ymm0, ymm1, ymm4, ymm0
+ fmaddpd xmm2, xmm3, xmm6, xmm2
+ fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1)
+ fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
+ mova COVAR(iq ,0), ymm0
+ mova COVAR(iq ,1), ymm1
+ mova COVAR(iq+2,2), xmm2
+ mova COVAR(iq+2,3), xmm3
+%else
vmulpd ymm0, ymm1, ymm4
vmulpd ymm1, ymm1, ymm5
vmulpd xmm2, xmm3, xmm6
@@ -147,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
ADDPD_MEM COVAR(iq ,1), ymm1
ADDPD_MEM COVAR(iq+2,2), xmm2
ADDPD_MEM COVAR(iq+2,3), xmm3
+%endif ; cpuflag(fma3)
lea jd, [iq + 4]
cmp jd, count2d
jg .skip4x4
.loop4x4:
; Compute all 16 pairwise products of a 4x4 block
mova ymm3, [varq + jq*8]
+%if cpuflag(fma3)
+ mova ymm0, COVAR(jq, 0)
+ mova ymm1, COVAR(jq, 1)
+ mova ymm2, COVAR(jq, 2)
+ fmaddpd ymm0, ymm3, ymm4, ymm0
+ fmaddpd ymm1, ymm3, ymm5, ymm1
+ fmaddpd ymm2, ymm3, ymm6, ymm2
+ fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
+ mova COVAR(jq, 0), ymm0
+ mova COVAR(jq, 1), ymm1
+ mova COVAR(jq, 2), ymm2
+ mova COVAR(jq, 3), ymm3
+%else
vmulpd ymm0, ymm3, ymm4
vmulpd ymm1, ymm3, ymm5
vmulpd ymm2, ymm3, ymm6
@@ -161,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
ADDPD_MEM COVAR(jq,1), ymm1
ADDPD_MEM COVAR(jq,2), ymm2
ADDPD_MEM COVAR(jq,3), ymm3
+%endif ; cpuflag(fma3)
add jd, 4
cmp jd, count2d
jle .loop4x4
@@ -168,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
cmp jd, countd
jg .skip2x4
mova xmm3, [varq + jq*8]
+%if cpuflag(fma3)
+ mova xmm0, COVAR(jq, 0)
+ mova xmm1, COVAR(jq, 1)
+ mova xmm2, COVAR(jq, 2)
+ fmaddpd xmm0, xmm3, xmm4, xmm0
+ fmaddpd xmm1, xmm3, xmm5, xmm1
+ fmaddpd xmm2, xmm3, xmm6, xmm2
+ fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
+ mova COVAR(jq, 0), xmm0
+ mova COVAR(jq, 1), xmm1
+ mova COVAR(jq, 2), xmm2
+ mova COVAR(jq, 3), xmm3
+%else
vmulpd xmm0, xmm3, xmm4
vmulpd xmm1, xmm3, xmm5
vmulpd xmm2, xmm3, xmm6
@@ -176,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
ADDPD_MEM COVAR(jq,1), xmm1
ADDPD_MEM COVAR(jq,2), xmm2
ADDPD_MEM COVAR(jq,3), xmm3
+%endif ; cpuflag(fma3)
.skip2x4:
add id, 4
add covarq, 4*COVAR_STRIDE
@@ -186,15 +227,30 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
mov jd, id
.loop2x1:
vmovddup xmm0, [varq + iq*8]
+%if cpuflag(fma3)
+ mova xmm1, [varq + jq*8]
+ fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
+ mova COVAR(jq,0), xmm0
+%else
vmulpd xmm0, [varq + jq*8]
ADDPD_MEM COVAR(jq,0), xmm0
+%endif ; cpuflag(fma3)
inc id
add covarq, COVAR_STRIDE
cmp id, countd
jle .loop2x1
.ret:
REP_RET
+%endmacro ; UPDATE_LLS
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+UPDATE_LLS
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+UPDATE_LLS
+%endif
INIT_XMM sse2
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
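
In the FMA3 variant above, each vmulpd + ADDPD_MEM pair collapses into a single
fmaddpd, i.e. covar += var_i * var_j with one rounding step. The same
accumulation written with AVX/FMA intrinsics, as a sketch of one 4-double
column of the covariance update (p, vi and vj are placeholders; the translation
unit must be compiled with FMA enabled):

    #include <immintrin.h>

    static inline void covar_accumulate(double *p, __m256d vi, __m256d vj)
    {
        __m256d acc = _mm256_load_pd(p);      /* one COVAR(jq, n) column */
        acc = _mm256_fmadd_pd(vi, vj, acc);   /* vi * vj + acc, single rounding */
        _mm256_store_pd(p, acc);
    }
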
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 80cda29139..9f0d862b0e 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -3,29 +3,30 @@
*
* Copyright (c) 2013 Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/lls.h"
#include "libavutil/x86/cpu.h"
-void ff_update_lls_sse2(LLSModel *m, double *var);
-void ff_update_lls_avx(LLSModel *m, double *var);
-double ff_evaluate_lls_sse2(LLSModel *m, double *var, int order);
+void ff_update_lls_sse2(LLSModel *m, const double *var);
+void ff_update_lls_avx(LLSModel *m, const double *var);
+void ff_update_lls_fma3(LLSModel *m, const double *var);
+double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
av_cold void ff_init_lls_x86(LLSModel *m)
{
@@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m)
if (EXTERNAL_AVX_FAST(cpu_flags)) {
m->update_lls = ff_update_lls_avx;
}
+ if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
+ m->update_lls = ff_update_lls_fma3;
+ }
}
diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
new file mode 100644
index 0000000000..7af3007d0c
--- /dev/null
+++ b/libavutil/x86/pixelutils.asm
@@ -0,0 +1,165 @@
+;******************************************************************************
+;* Pixel utilities SIMD
+;*
+;* Copyright (C) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (C) 2014 Clément Bœsch <u pkh me>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86util.asm"
+
+SECTION .text
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_MMX mmx
+cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
+ pxor m7, m7
+ pxor m6, m6
+%rep 4
+ mova m0, [src1q]
+ mova m2, [src1q + stride1q]
+ mova m1, [src2q]
+ mova m3, [src2q + stride2q]
+ psubusb m4, m0, m1
+ psubusb m5, m2, m3
+ psubusb m1, m0
+ psubusb m3, m2
+ por m1, m4
+ por m3, m5
+ punpcklbw m0, m1, m7
+ punpcklbw m2, m3, m7
+ punpckhbw m1, m7
+ punpckhbw m3, m7
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ paddw m6, m0
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+%endrep
+ psrlq m0, m6, 32
+ paddw m6, m0
+ psrlq m0, m6, 16
+ paddw m6, m0
+ movd eax, m6
+ movzx eax, ax
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
+ pxor m2, m2
+%rep 4
+ mova m0, [src1q]
+ mova m1, [src1q + stride1q]
+ psadbw m0, [src2q]
+ psadbw m1, [src2q + stride2q]
+ paddw m2, m0
+ paddw m2, m1
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+%endrep
+ movd eax, m2
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pixelutils_sad_16x16, 4,4,0, src1, stride1, src2, stride2
+ pxor m2, m2
+%rep 16
+ mova m0, [src1q]
+ mova m1, [src1q + 8]
+ psadbw m0, [src2q]
+ psadbw m1, [src2q + 8]
+ paddw m2, m0
+ paddw m2, m1
+ add src1q, stride1q
+ add src2q, stride2q
+%endrep
+ movd eax, m2
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
+ movu m4, [src1q]
+ movu m2, [src2q]
+ movu m1, [src1q + stride1q]
+ movu m3, [src2q + stride2q]
+ psadbw m4, m2
+ psadbw m1, m3
+ paddw m4, m1
+%rep 7
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+ movu m0, [src1q]
+ movu m2, [src2q]
+ movu m1, [src1q + stride1q]
+ movu m3, [src2q + stride2q]
+ psadbw m0, m2
+ psadbw m1, m3
+ paddw m4, m0
+ paddw m4, m1
+%endrep
+ movhlps m0, m4
+ paddw m4, m0
+ movd eax, m4
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_[au]_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+%macro SAD_XMM_16x16 1
+INIT_XMM sse2
+cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
+ mov%1 m2, [src2q]
+ psadbw m2, [src1q]
+ mov%1 m1, [src2q + stride2q]
+ psadbw m1, [src1q + stride1q]
+ paddw m2, m1
+%rep 7
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+ mov%1 m0, [src2q]
+ psadbw m0, [src1q]
+ mov%1 m1, [src2q + stride2q]
+ psadbw m1, [src1q + stride1q]
+ paddw m2, m0
+ paddw m2, m1
+%endrep
+ movhlps m0, m2
+ paddw m2, m0
+ movd eax, m2
+ RET
+%endmacro
+
+SAD_XMM_16x16 a
+SAD_XMM_16x16 u
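
Every kernel above computes a plain sum of absolute differences over an 8x8 or
16x16 block: the MMX version via unsigned-saturating subtracts in both
directions, the MMXEXT and SSE2 versions via psadbw. Scalar reference:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    static int sad_block_ref(const uint8_t *src1, ptrdiff_t stride1,
                             const uint8_t *src2, ptrdiff_t stride2, int size)
    {
        int x, y, sum = 0;
        for (y = 0; y < size; y++) {       /* size is 8 or 16 for these kernels */
            for (x = 0; x < size; x++)
                sum += abs(src1[x] - src2[x]);
            src1 += stride1;
            src2 += stride2;
        }
        return sum;
    }
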
diff --git a/libavutil/x86/pixelutils.h b/libavutil/x86/pixelutils.h
new file mode 100644
index 0000000000..876cf46053
--- /dev/null
+++ b/libavutil/x86/pixelutils.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_PIXELUTILS_H
+#define AVUTIL_X86_PIXELUTILS_H
+
+#include "libavutil/pixelutils.h"
+
+void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned);
+
+#endif /* AVUTIL_X86_PIXELUTILS_H */
diff --git a/libavutil/x86/pixelutils_init.c b/libavutil/x86/pixelutils_init.c
new file mode 100644
index 0000000000..c24a533aea
--- /dev/null
+++ b/libavutil/x86/pixelutils_init.c
@@ -0,0 +1,64 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "pixelutils.h"
+#include "cpu.h"
+
+int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+
+int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+
+void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ sad[2] = ff_pixelutils_sad_8x8_mmx;
+ }
+
+ // The best way to use SSE2 would be to do 2 SADs in parallel,
+ // but we'd have to modify the pixelutils API to return SIMD functions.
+
+ // It's probably not faster to shuffle data around
+ // to get two lines of 8 pixels into a single 16byte register,
+ // so just use the MMX 8x8 version even when SSE2 is available.
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ sad[2] = ff_pixelutils_sad_8x8_mmxext;
+ sad[3] = ff_pixelutils_sad_16x16_mmxext;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ switch (aligned) {
+ case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned
+ case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned
+ case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned
+ }
+ }
+}
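
The three SSE2 variants cover the alignment cases the dispatcher distinguishes:
fully unaligned, src1 aligned, and both aligned. Callers reach these through
the public av_pixelutils_get_sad_fn() entry point declared in
libavutil/pixelutils.h; a usage sketch, assuming that API behaves as its header
describes:

    #include <stddef.h>
    #include <stdint.h>
    #include "libavutil/pixelutils.h"

    static int block_cost_16x16(const uint8_t *cur, ptrdiff_t cur_stride,
                                const uint8_t *ref, ptrdiff_t ref_stride)
    {
        /* 16x16 block: w_bits = h_bits = 4; aligned = 2 means both inputs are
         * 16-byte aligned, which maps to the "a" variant installed above. */
        av_pixelutils_sad_fn sad = av_pixelutils_get_sad_fn(4, 4, 2, NULL);
        return sad(cur, cur_stride, ref, ref_stride);
    }
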
diff --git a/libavutil/x86/timer.h b/libavutil/x86/timer.h
index bb7c341341..4d1e88def0 100644
--- a/libavutil/x86/timer.h
+++ b/libavutil/x86/timer.h
@@ -1,20 +1,20 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,6 +25,7 @@
#if HAVE_INLINE_ASM
+#define FF_TIMER_UNITS "decicycles"
#define AV_READ_TIME read_time
static inline uint64_t read_time(void)
diff --git a/libavutil/x86/w64xmmtest.h b/libavutil/x86/w64xmmtest.h
index b4ce7d3daf..a4a05b0419 100644
--- a/libavutil/x86/w64xmmtest.h
+++ b/libavutil/x86/w64xmmtest.h
@@ -2,23 +2,26 @@
* check XMM registers for clobbers on Win64
* Copyright (c) 2008 Ramiro Polla <ramiro.polla@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#ifndef AVUTIL_X86_W64XMMTEST_H
+#define AVUTIL_X86_W64XMMTEST_H
+
#include <inttypes.h>
#include <stdint.h>
#include <stdlib.h>
@@ -71,3 +74,5 @@
int __real_ ## func; \
int __wrap_ ## func; \
int __wrap_ ## func
+
+#endif /* AVUTIL_X86_W64XMMTEST_H */
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 6ad9785536..3a0a2612ba 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1,7 +1,7 @@
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2015 x264 project
+;* Copyright (C) 2005-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Anton Mitrofanov <BugMaster@narod.ru>
@@ -183,9 +183,9 @@
%define e%1h %3
%define r%1b %2
%define e%1b %2
-%if ARCH_X86_64 == 0
- %define r%1 e%1
-%endif
+ %if ARCH_X86_64 == 0
+ %define r%1 e%1
+ %endif
%endmacro
DECLARE_REG_SIZE ax, al, ah
@@ -295,7 +295,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%macro ASSERT 1
%if (%1) == 0
- %error assert failed
+ %error assertion ``%1'' failed
%endif
%endmacro
@@ -386,8 +386,11 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
%if %1 > 0
%assign regs_used (regs_used + 1)
- %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
- %warning "Stack pointer will overwrite register argument"
+ %endif
+ %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+ ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
+ ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
+ %assign regs_used 5 + UNIX64 * 3
%endif
%endif
%endif
@@ -501,9 +504,9 @@ DECLARE_REG 14, R15, 120
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-%if mmsize == 32
- vzeroupper
-%endif
+ %if mmsize == 32
+ vzeroupper
+ %endif
AUTO_REP_RET
%endmacro
@@ -540,17 +543,17 @@ DECLARE_REG 14, R15, 72
%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
%macro RET 0
-%if stack_size_padded > 0
-%if required_stack_alignment > STACK_ALIGNMENT
- mov rsp, rstkm
-%else
- add rsp, stack_size_padded
-%endif
-%endif
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
POP_IF_USED 14, 13, 12, 11, 10, 9
-%if mmsize == 32
- vzeroupper
-%endif
+ %if mmsize == 32
+ vzeroupper
+ %endif
AUTO_REP_RET
%endmacro
@@ -596,29 +599,29 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
%macro RET 0
-%if stack_size_padded > 0
-%if required_stack_alignment > STACK_ALIGNMENT
- mov rsp, rstkm
-%else
- add rsp, stack_size_padded
-%endif
-%endif
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
POP_IF_USED 6, 5, 4, 3
-%if mmsize == 32
- vzeroupper
-%endif
+ %if mmsize == 32
+ vzeroupper
+ %endif
AUTO_REP_RET
%endmacro
%endif ;======================================================================
%if WIN64 == 0
-%macro WIN64_SPILL_XMM 1
-%endmacro
-%macro WIN64_RESTORE_XMM 1
-%endmacro
-%macro WIN64_PUSH_XMM 0
-%endmacro
+ %macro WIN64_SPILL_XMM 1
+ %endmacro
+ %macro WIN64_RESTORE_XMM 1
+ %endmacro
+ %macro WIN64_PUSH_XMM 0
+ %endmacro
%endif
; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
@@ -631,24 +634,26 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%else
rep ret
%endif
+ annotate_function_size
%endmacro
%define last_branch_adr $$
%macro AUTO_REP_RET 0
- %ifndef cpuflags
- times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
- %elif notcpuflag(ssse3)
- times ((last_branch_adr-$)>>31)+1 rep
+ %if notcpuflag(ssse3)
+ times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
%endif
ret
+ annotate_function_size
%endmacro
%macro BRANCH_INSTR 0-*
%rep %0
%macro %1 1-2 %1
%2 %1
- %%branch_instr:
- %xdefine last_branch_adr %%branch_instr
+ %if notcpuflag(ssse3)
+ %%branch_instr equ $
+ %xdefine last_branch_adr %%branch_instr
+ %endif
%endmacro
%rotate 1
%endrep
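
The repeat count in AUTO_REP_RET, times ((last_branch_adr-$)>>31)+1, evaluates to 1 only when the ret sits exactly at the recorded end of the last branch instruction, and to 0 otherwise. A scalar C sketch of that count, for illustration only (assumes the usual two's-complement arithmetic right shift):

#include <stdint.h>
#include <stdio.h>

/* 1 if the current position equals the end of the last branch, else 0:
 * a zero difference stays non-negative after the shift, while a positive
 * gap becomes negative once negated by the subtraction order. */
static int rep_count(int32_t last_branch_adr, int32_t here)
{
    return ((last_branch_adr - here) >> 31) + 1;
}

int main(void)
{
    printf("%d\n", rep_count(100, 100)); /* 1: ret directly follows a branch, emit rep */
    printf("%d\n", rep_count(100, 120)); /* 0: something in between, plain ret         */
    return 0;
}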
@@ -663,6 +668,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%elif %2
jmp %1
%endif
+ annotate_function_size
%endmacro
;=============================================================================
@@ -684,6 +690,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
+ annotate_function_size
%if %1
%xdefine %%FUNCTION_PREFIX private_prefix
%xdefine %%VISIBILITY hidden
@@ -697,6 +704,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
CAT_XDEFINE cglobaled_, %2, 1
%endif
%xdefine current_function %2
+ %xdefine current_function_section __SECT__
%if FORMAT_ELF
global %2:function %%VISIBILITY
%else
@@ -745,6 +753,24 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
[SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0
+ %ifdef __YASM_VER__
+ %ifdef current_function
+ %if FORMAT_ELF
+ current_function_section
+ %%ecf equ $
+ size current_function %%ecf - current_function
+ __SECT__
+ %endif
+ %endif
+ %endif
+%endmacro
+
; cpuflags
%assign cpuflags_mmx (1<<0)
@@ -772,9 +798,11 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign cpuflags_atom (1<<21)
%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
+%assign cpuflags_aesni (1<<24)|cpuflags_sse42
-%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
-%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
+%define notcpuflag(x) (cpuflag(x) ^ 1)
; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
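
The arithmetic in the new cpuflag() definition is terse: (cpuflags & mask) ^ mask is zero only when every bit of mask is present, subtracting 1 makes exactly that case negative, and the shifted-down sign bit yields a clean 0/1 value. A scalar C rendering of the same idea, for illustration only:

#include <stdint.h>
#include <stdio.h>

/* Returns 1 iff every bit of `mask` is set in `flags`, without branches.
 * The .asm version shifts by 31 because the cpuflag values all fit below
 * bit 31; 64-bit arithmetic is used here to keep the shift well defined. */
static int has_all_flags(uint32_t flags, uint32_t mask)
{
    uint32_t miss = (flags & mask) ^ mask;        /* 0 iff all mask bits present */
    return (int)(((int64_t)miss - 1) >> 63) & 1;  /* sign bit of miss-1 as 0/1   */
}

int main(void)
{
    printf("%d\n", has_all_flags(0x7, 0x5)); /* 1: bits 0 and 2 both set */
    printf("%d\n", has_all_flags(0x4, 0x5)); /* 0: bit 0 missing         */
    return 0;
}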
@@ -843,14 +871,14 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define movnta movntq
%assign %%i 0
%rep 8
- CAT_XDEFINE m, %%i, mm %+ %%i
- CAT_XDEFINE nnmm, %%i, %%i
- %assign %%i %%i+1
+ CAT_XDEFINE m, %%i, mm %+ %%i
+ CAT_XDEFINE nnmm, %%i, %%i
+ %assign %%i %%i+1
%endrep
%rep 8
- CAT_UNDEF m, %%i
- CAT_UNDEF nnmm, %%i
- %assign %%i %%i+1
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nnmm, %%i
+ %assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
%endmacro
@@ -861,7 +889,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define mmsize 16
%define num_mmregs 8
%if ARCH_X86_64
- %define num_mmregs 16
+ %define num_mmregs 16
%endif
%define mova movdqa
%define movu movdqu
@@ -869,9 +897,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define movnta movntdq
%assign %%i 0
%rep num_mmregs
- CAT_XDEFINE m, %%i, xmm %+ %%i
- CAT_XDEFINE nnxmm, %%i, %%i
- %assign %%i %%i+1
+ CAT_XDEFINE m, %%i, xmm %+ %%i
+ CAT_XDEFINE nnxmm, %%i, %%i
+ %assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
%endmacro
@@ -882,7 +910,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define mmsize 32
%define num_mmregs 8
%if ARCH_X86_64
- %define num_mmregs 16
+ %define num_mmregs 16
%endif
%define mova movdqa
%define movu movdqu
@@ -890,9 +918,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define movnta movntdq
%assign %%i 0
%rep num_mmregs
- CAT_XDEFINE m, %%i, ymm %+ %%i
- CAT_XDEFINE nnymm, %%i, %%i
- %assign %%i %%i+1
+ CAT_XDEFINE m, %%i, ymm %+ %%i
+ CAT_XDEFINE nnymm, %%i, %%i
+ %assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
%endmacro
@@ -916,7 +944,7 @@ INIT_XMM
%assign i 0
%rep 16
DECLARE_MMCAST i
-%assign i i+1
+ %assign i i+1
%endrep
; I often want to use macros that permute their arguments. e.g. there's no
@@ -934,23 +962,23 @@ INIT_XMM
; doesn't cost any cycles.
%macro PERMUTE 2-* ; takes a list of pairs to swap
-%rep %0/2
- %xdefine %%tmp%2 m%2
- %rotate 2
-%endrep
-%rep %0/2
- %xdefine m%1 %%tmp%2
- CAT_XDEFINE nn, m%1, %1
- %rotate 2
-%endrep
+ %rep %0/2
+ %xdefine %%tmp%2 m%2
+ %rotate 2
+ %endrep
+ %rep %0/2
+ %xdefine m%1 %%tmp%2
+ CAT_XDEFINE nn, m%1, %1
+ %rotate 2
+ %endrep
%endmacro
%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
-%ifnum %1 ; SWAP 0, 1, ...
- SWAP_INTERNAL_NUM %1, %2
-%else ; SWAP m0, m1, ...
- SWAP_INTERNAL_NAME %1, %2
-%endif
+ %ifnum %1 ; SWAP 0, 1, ...
+ SWAP_INTERNAL_NUM %1, %2
+ %else ; SWAP m0, m1, ...
+ SWAP_INTERNAL_NAME %1, %2
+ %endif
%endmacro
%macro SWAP_INTERNAL_NUM 2-*
@@ -960,7 +988,7 @@ INIT_XMM
%xdefine m%2 %%tmp
CAT_XDEFINE nn, m%1, %1
CAT_XDEFINE nn, m%2, %2
- %rotate 1
+ %rotate 1
%endrep
%endmacro
@@ -968,7 +996,7 @@ INIT_XMM
%xdefine %%args nn %+ %1
%rep %0-1
%xdefine %%args %%args, nn %+ %2
- %rotate 1
+ %rotate 1
%endrep
SWAP_INTERNAL_NUM %%args
%endmacro
@@ -985,7 +1013,7 @@ INIT_XMM
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE %%f, %%i, m %+ %%i
- %assign %%i %%i+1
+ %assign %%i %%i+1
%endrep
%endmacro
@@ -995,7 +1023,7 @@ INIT_XMM
%rep num_mmregs
CAT_XDEFINE m, %%i, %1_m %+ %%i
CAT_XDEFINE nn, m %+ %%i, %%i
- %assign %%i %%i+1
+ %assign %%i %%i+1
%endrep
%endif
%endmacro
@@ -1051,7 +1079,7 @@ INIT_XMM
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
-%assign i i+1
+ %assign i i+1
%endrep
%undef i
@@ -1170,12 +1198,12 @@ AVX_INSTR addsd, sse2, 1, 0, 1
AVX_INSTR addss, sse, 1, 0, 1
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
-AVX_INSTR aesdec, fnord, 0, 0, 0
-AVX_INSTR aesdeclast, fnord, 0, 0, 0
-AVX_INSTR aesenc, fnord, 0, 0, 0
-AVX_INSTR aesenclast, fnord, 0, 0, 0
-AVX_INSTR aesimc
-AVX_INSTR aeskeygenassist
+AVX_INSTR aesdec, aesni, 0, 0, 0
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
+AVX_INSTR aesenc, aesni, 0, 0, 0
+AVX_INSTR aesenclast, aesni, 0, 0, 0
+AVX_INSTR aesimc, aesni
+AVX_INSTR aeskeygenassist, aesni
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
@@ -1428,7 +1456,7 @@ AVX_INSTR pfmul, 3dnow, 1, 0, 1
%else
CAT_XDEFINE q, j, i
%endif
-%assign i i+1
+ %assign i i+1
%endrep
%undef i
%undef j
@@ -1455,47 +1483,44 @@ FMA_INSTR pmadcswd, pmaddwd, paddd
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf
-; convert FMA4 to FMA3 if possible
-%macro FMA4_INSTR 4
- %macro %1 4-8 %1, %2, %3, %4
- %if cpuflag(fma4)
- v%5 %1, %2, %3, %4
- %elifidn %1, %2
- v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
- %elifidn %1, %3
- v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
- %elifidn %1, %4
- v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
- %else
- %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
- %endif
- %endmacro
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+%macro FMA4_INSTR 2-*
+ %push fma4_instr
+ %xdefine %$prefix %1
+ %rep %0 - 1
+ %macro %$prefix%2 4-6 %$prefix, %2
+ %if notcpuflag(fma3) && notcpuflag(fma4)
+ %error use of ``%5%6'' fma instruction in cpuname function: current_function
+ %elif cpuflag(fma4)
+ v%5%6 %1, %2, %3, %4
+ %elifidn %1, %2
+ ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+ %ifid %3
+ v%{5}213%6 %2, %3, %4
+ %else
+ v%{5}132%6 %2, %4, %3
+ %endif
+ %elifidn %1, %3
+ v%{5}213%6 %3, %2, %4
+ %elifidn %1, %4
+ v%{5}231%6 %4, %2, %3
+ %else
+ %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+ %pop
%endmacro
-FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
-FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
-FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
-FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
-
-FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
-FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
-FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
-FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
-
-FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
-FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
-FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
-FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
-
-FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
-FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
-FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
-FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
-
-FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
-FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
-FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
-FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
+FMA4_INSTR fmadd, pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub, pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd, pd, ps, sd, ss
+FMA4_INSTR fnmsub, pd, ps, sd, ss
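
To make the selection logic above easier to follow, here is an illustrative C restatement (a hypothetical helper, not part of the patch) of how a 4-operand dst = src1*src2 + src3 request is mapped onto the three FMA3 forms, each of which requires dst to alias one of the sources and only accepts a memory operand in the last position:

#include <stdio.h>

/* Which FMA3 form the macro picks, given which source aliases dst and
 * whether src2 lives in memory (operand order shown in parentheses). */
static const char *fma3_form(int dst, int src1, int src2, int src3, int src2_in_memory)
{
    if (dst == src1)
        return src2_in_memory ? "132 (dst, src3, src2)"  /* memory operand goes last */
                              : "213 (dst, src2, src3)";
    if (dst == src2)
        return "213 (dst, src1, src3)";
    if (dst == src3)
        return "231 (dst, src1, src2)";
    return "no FMA3 form: dst must alias a source";
}

int main(void)
{
    printf("%s\n", fma3_form(0, 0, 1, 2, 0)); /* dst == src1, src2 in a register */
    printf("%s\n", fma3_form(0, 1, 2, 0, 0)); /* dst == src3, use the 231 form   */
    return 0;
}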
; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
%ifdef __YASM_VER__
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 9f64dd13e1..b09fa813e2 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -6,20 +6,20 @@
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -69,6 +69,15 @@
SWAP %2, %3
%endmacro
+%macro TRANSPOSE2x4x4B 5
+ SBUTTERFLY bw, %1, %2, %5
+ SBUTTERFLY bw, %3, %4, %5
+ SBUTTERFLY wd, %1, %3, %5
+ SBUTTERFLY wd, %2, %4, %5
+ SBUTTERFLY dq, %1, %2, %5
+ SBUTTERFLY dq, %3, %4, %5
+%endmacro
+
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
@@ -98,6 +107,43 @@
SWAP %5, %2, %3
%endmacro
+%macro TRANSPOSE8x4D 9-11
+%if ARCH_X86_64
+ SBUTTERFLY dq, %1, %2, %9
+ SBUTTERFLY dq, %3, %4, %9
+ SBUTTERFLY dq, %5, %6, %9
+ SBUTTERFLY dq, %7, %8, %9
+ SBUTTERFLY qdq, %1, %3, %9
+ SBUTTERFLY qdq, %2, %4, %9
+ SBUTTERFLY qdq, %5, %7, %9
+ SBUTTERFLY qdq, %6, %8, %9
+ SWAP %2, %5
+ SWAP %4, %7
+%else
+; in: m0..m7
+; out: m0..m7, unless %11 in which case m2 is in %9
+; spills into %9 and %10
+ movdqa %9, m%7
+ SBUTTERFLY dq, %1, %2, %7
+ movdqa %10, m%2
+ movdqa m%7, %9
+ SBUTTERFLY dq, %3, %4, %2
+ SBUTTERFLY dq, %5, %6, %2
+ SBUTTERFLY dq, %7, %8, %2
+ SBUTTERFLY qdq, %1, %3, %2
+ movdqa %9, m%3
+ movdqa m%2, %10
+ SBUTTERFLY qdq, %2, %4, %3
+ SBUTTERFLY qdq, %5, %7, %3
+ SBUTTERFLY qdq, %6, %8, %3
+ SWAP %2, %5
+ SWAP %4, %7
+%if %0<11
+ movdqa m%3, %9
+%endif
+%endif
+%endmacro
+
%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
SBUTTERFLY wd, %1, %2, %9
@@ -164,13 +210,13 @@
%endif
%endmacro
-%macro PSIGNW_MMX 2
+%macro PSIGNW 2
+%if cpuflag(ssse3)
+ psignw %1, %2
+%else
pxor %1, %2
psubw %1, %2
-%endmacro
-
-%macro PSIGNW_SSSE3 2
- psignw %1, %2
+%endif
%endmacro
%macro ABS1 2
@@ -273,6 +319,44 @@
%endif
%endmacro
+%macro HADDD 2 ; sum junk
+%if sizeof%1 == 32
+%define %2 xmm%2
+ vextracti128 %2, %1, 1
+%define %1 xmm%1
+ paddd %1, %2
+%endif
+%if mmsize >= 16
+%if cpuflag(xop) && sizeof%1 == 16
+ vphadddq %1, %1
+%endif
+ movhlps %2, %1
+ paddd %1, %2
+%endif
+%if notcpuflag(xop) || sizeof%1 != 16
+%if cpuflag(mmxext)
+ PSHUFLW %2, %1, q0032
+%else ; mmx
+ mova %2, %1
+ psrlq %2, 32
+%endif
+ paddd %1, %2
+%endif
+%undef %1
+%undef %2
+%endmacro
+
+%macro HADDW 2 ; reg, tmp
+%if cpuflag(xop) && sizeof%1 == 16
+ vphaddwq %1, %1
+ movhlps %2, %1
+ paddd %1, %2
+%else
+ pmaddwd %1, [pw_1]
+ HADDD %1, %2
+%endif
+%endmacro
+
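
For reference, a scalar C sketch of what the new HADDW macro computes per register: all eight signed 16-bit lanes are reduced to one 32-bit sum (HADDD is the analogous reduction over four 32-bit lanes). Illustration only.

#include <stdint.h>
#include <stdio.h>

/* Scalar equivalent of HADDW for one 128-bit register's worth of data. */
static int32_t haddw_scalar(const int16_t v[8])
{
    int32_t sum = 0;
    for (int i = 0; i < 8; i++)
        sum += v[i];
    return sum;
}

int main(void)
{
    int16_t v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    printf("%d\n", haddw_scalar(v)); /* 36 */
    return 0;
}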
%macro PALIGNR 4-5
%if cpuflag(ssse3)
%if %0==5
@@ -302,11 +386,19 @@
%endif
%endmacro
-%macro PAVGB 2
+%macro PAVGB 2-4
%if cpuflag(mmxext)
pavgb %1, %2
%elif cpuflag(3dnow)
pavgusb %1, %2
+%elif cpuflag(mmx)
+ movu %3, %2
+ por %3, %1
+ pxor %1, %2
+ pand %1, %4
+ psrlq %1, 1
+ psubb %3, %1
+ SWAP %1, %3
%endif
%endmacro
@@ -552,7 +644,9 @@
%endmacro
%macro SPLATW 2-3 0
-%if mmsize == 16
+%if cpuflag(avx2) && %3 == 0
+ vpbroadcastw %1, %2
+%elif mmsize == 16
pshuflw %1, %2, (%3)*0x55
punpcklqdq %1, %1
%elif cpuflag(mmxext)
@@ -683,3 +777,19 @@
addps %1, %4
%endif
%endmacro
+
+%macro LSHIFT 2
+%if mmsize > 8
+ pslldq %1, %2
+%else
+ psllq %1, 8*(%2)
+%endif
+%endmacro
+
+%macro RSHIFT 2
+%if mmsize > 8
+ psrldq %1, %2
+%else
+ psrlq %1, 8*(%2)
+%endif
+%endmacro